Statistics
| Revision:

svn-gvsig-desktop / trunk / org.gvsig.desktop / org.gvsig.desktop.library / org.gvsig.utils / src / main / java / org / gvsig / utils / xml / XMLEncodingUtils.java @ 40561

History | View | Annotate | Download (6.3 KB)

1
/**
2
 * gvSIG. Desktop Geographic Information System.
3
 *
4
 * Copyright (C) 2007-2013 gvSIG Association.
5
 *
6
 * This program is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU General Public License
8
 * as published by the Free Software Foundation; either version 3
9
 * of the License, or (at your option) any later version.
10
 *
11
 * This program is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
 * GNU General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU General Public License
17
 * along with this program; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19
 * MA  02110-1301, USA.
20
 *
21
 * For any additional information, do not hesitate to contact us
22
 * at info AT gvsig.com, or visit our website www.gvsig.com.
23
 */
24
package org.gvsig.utils.xml;
25

    
26
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.SequenceInputStream;
import java.io.UnsupportedEncodingException;
34

    
35
/**
36
 * A set of methods to detect XML encoding. The class is able to autodetect
37
 * certain encodings, and it reads the XML header for the rest of encodings.
38
 * 
39
 * @author César Martínez Izquierdo <cesar.martinez@iver.es>
40
 *
41
 */
42
public class XMLEncodingUtils {

    /**
     * Upper bound on the number of bytes scanned while looking for the
     * closing '>' of the XML declaration. Keeps a malformed or non-XML
     * input from being buffered without limit.
     */
    private static final int MAX_DECLARATION_BYTES = 4096;

    private static final byte[] NO_BYTES = new byte[0];

    InputStream _is;

    /** True once detection has run; the stream is only sniffed once. */
    private boolean detected;

    /** Cached result of the detection (may be null). */
    private String encoding;

    /**
     * Content bytes (byte order mark excluded) that detection had to
     * consume from the stream. They are replayed in front of the stream
     * by {@link #getReader()} so that the reader does not lose them.
     */
    private byte[] pending = NO_BYTES;

    /**
     * Creates a new XMLEncodingUtils object.
     *
     * @param is An InputStream connected to the XML file to process.
     * @throws IllegalArgumentException if <code>is</code> is null.
     */
    public XMLEncodingUtils(InputStream is) {
        if (is == null) {
            throw new IllegalArgumentException("is must not be null");
        }
        _is = is;
    }

    /**
     * Gets the encoding of the XML file.
     *
     * The following encodings can be detected: UTF-32BE, UTF-32LE,
     * UTF-16BE, UTF-16LE, UTF-8 (the latter only through its byte order
     * mark). The rest of the encodings are read from the XML header.
     *
     * Detection reads from the underlying stream: the first four bytes,
     * plus the rest of the XML declaration when the input starts with
     * "&lt;?xm". The result is cached, so calling this method again does
     * not read from the stream a second time.
     *
     * @return Returns the encoding of the XML file, or null if the
     * encoding couldn't be correctly detected or read from the XML
     * header (including when reading the stream fails).
     */
    public String getEncoding() {
        if (!detected) {
            detected = true;
            try {
                encoding = detect();
            } catch (IOException ex) {
                encoding = null;
            }
        }
        return encoding;
    }

    /**
     * Sniffs the first four bytes of the stream and derives the encoding
     * from a byte order mark, from the byte pattern of a leading '&lt;' or
     * "&lt;?", or from the XML declaration.
     */
    private String detect() throws IOException {
        byte[] head = new byte[4];
        int count = 0;
        int chk = 0;
        while (count < head.length) {
            int b = _is.read();
            if (b == -1) {
                break;
            }
            chk = (chk << 8) | b;
            head[count++] = (byte) b;
        }
        if (count < head.length) {
            return null;
        }

        switch (chk) {
        case 0x0000FEFF: // UTF-32 big endian byte order mark
            return "UTF-32BE";
        case 0xFFFE0000: // UTF-32 little endian byte order mark
            return "UTF-32LE";
        case 0x0000003C: // '<' as a big endian 32 bit unit
            pending = head;
            return "UTF-32BE";
        case 0x3C000000: // '<' as a little endian 32 bit unit
            pending = head;
            return "UTF-32LE";
        case 0x003C003F: // "<?" as big endian 16 bit units
            pending = head;
            return "UTF-16BE";
        case 0x3C003F00: // "<?" as little endian 16 bit units
            pending = head;
            return "UTF-16LE";
        case 0x3C3F786D: // "<?xm" in an ASCII compatible encoding
            return readDeclaredEncoding(head);
        default:
            break;
        }

        // The UTF-32LE mark (FF FE 00 00) was handled above, so a leading
        // FF FE here can only be the UTF-16LE mark.
        if ((chk & 0xFFFF0000) == 0xFEFF0000) {
            pending = new byte[] {head[2], head[3]};
            return "UTF-16BE";
        }
        if ((chk & 0xFFFF0000) == 0xFFFE0000) {
            pending = new byte[] {head[2], head[3]};
            return "UTF-16LE";
        }
        if ((chk & 0xFFFFFF00) == 0xEFBBBF00) {
            pending = new byte[] {head[3]};
            return "UTF-8";
        }
        return null;
    }

    /**
     * Reads the remainder of the XML declaration (up to and including the
     * first '>') and extracts the value of its "encoding" pseudo-attribute.
     *
     * @param head the four bytes already read from the stream
     * @return the declared encoding, or null if the declaration is not
     * terminated within {@link #MAX_DECLARATION_BYTES} bytes, has no
     * encoding pseudo-attribute, or the attribute is malformed
     */
    private String readDeclaredEncoding(byte[] head) throws IOException {
        ByteArrayOutputStream consumed = new ByteArrayOutputStream(128);
        consumed.write(head, 0, head.length);
        boolean terminated = false;
        while (consumed.size() < MAX_DECLARATION_BYTES) {
            int b = _is.read();
            if (b == -1) {
                break;
            }
            consumed.write(b);
            if (b == '>') {
                terminated = true;
                break;
            }
        }
        pending = consumed.toByteArray();
        if (!terminated) {
            return null;
        }
        // One byte maps to one char; only the ASCII part of the
        // declaration is inspected.
        return parseEncodingAttribute(new String(pending, "ISO-8859-1"));
    }

    /**
     * Extracts the quoted value following the word "encoding" in the
     * given declaration text.
     *
     * @return the value between the quotes, or null if the word or a
     * matching pair of quotes is missing
     */
    private static String parseEncodingAttribute(String declaration) {
        int name = declaration.indexOf("encoding");
        if (name == -1) {
            return null;
        }
        int open = name + "encoding".length();
        while (open < declaration.length()
                && declaration.charAt(open) != '"'
                && declaration.charAt(open) != '\'') {
            open++;
        }
        if (open >= declaration.length()) {
            return null;
        }
        char quote = declaration.charAt(open);
        int close = declaration.indexOf(quote, open + 1);
        if (close == -1) {
            return null;
        }
        return declaration.substring(open + 1, close);
    }

    /**
     * Gets an InputStreamReader for the provided XML file.
     * The reader uses the right encoding, as specified in
     * the XML header (or autodetected).
     *
     * The content bytes consumed while detecting the encoding are
     * replayed, so the reader starts at the beginning of the document
     * (just after the byte order mark, if there is one). Closing the
     * reader closes the underlying stream.
     *
     * @return A reader which uses the right encoding, or null
     * if the encoding couldn't be correctly detected or read
     * from the XML header, or if the encoding is not supported.
     */
    public InputStreamReader getReader() {
        String enc = getEncoding();
        if (enc == null) {
            return null;
        }
        InputStream source = _is;
        if (pending.length > 0) {
            source = new SequenceInputStream(
                    new ByteArrayInputStream(pending), _is);
        }
        try {
            InputStreamReader reader = new InputStreamReader(source, enc);
            // The consumed prefix must be replayed only once.
            pending = NO_BYTES;
            return reader;
        } catch (UnsupportedEncodingException e) {
            return null;
        }
    }

    /**
     * Gets an InputStreamReader for the provided XML file.
     * The reader uses the right encoding, as specified in
     * the XML header (or autodetected).
     *
     * @param is An InputStream connected to the XML file to process
     * @return A reader for the provided XML file, or null if the
     * encoding couldn't be determined.
     * @see #getReader()
     */
    public static InputStreamReader getReader(InputStream is) {
        return new XMLEncodingUtils(is).getReader();
    }

    /**
     * Gets the character encoding of the XML file. The bytes needed
     * for the detection are consumed from the stream.
     *
     * @param is An InputStream connected to the XML file to process
     * @see #getEncoding()
     * @return The encoding of the file, or null if it couldn't be
     * determined.
     */
    public static String getEncoding(InputStream is) {
        return new XMLEncodingUtils(is).getEncoding();
    }

    /**
     * Gets an InputStreamReader for the provided XML file.
     * The reader uses the right encoding, as specified in
     * the XML header (or autodetected). The caller is responsible
     * for closing the returned reader; if no reader can be created
     * the file is closed before returning.
     *
     * @param file The XML file to process
     * @return A reader for the provided XML file, or null if the
     * encoding couldn't be determined.
     * @throws FileNotFoundException if the file cannot be opened
     * @see #getReader()
     */
    public static InputStreamReader getReader(File file) throws FileNotFoundException {
        BufferedInputStream bs = new BufferedInputStream(new FileInputStream(file));
        InputStreamReader reader = null;
        try {
            reader = new XMLEncodingUtils(bs).getReader();
            return reader;
        } finally {
            if (reader == null) {
                closeQuietly(bs);
            }
        }
    }

    /**
     * Gets the character encoding of the XML file. The file is
     * closed before this method returns.
     *
     * @param file The XML file to process
     * @see #getEncoding()
     * @return The encoding of the file, or null if it couldn't be
     * determined.
     * @throws FileNotFoundException if the file cannot be opened
     */
    public static String getEncoding(File file) throws FileNotFoundException {
        BufferedInputStream bs = new BufferedInputStream(new FileInputStream(file));
        try {
            return new XMLEncodingUtils(bs).getEncoding();
        } finally {
            closeQuietly(bs);
        }
    }

    /** Closes the stream, ignoring failures: nothing useful can be done with them here. */
    private static void closeQuietly(InputStream in) {
        try {
            in.close();
        } catch (IOException ignored) {
            // Best-effort cleanup of a stream that is no longer needed.
        }
    }
}