svn-gvsig-desktop / branches / org.gvsig.desktop-2018a / org.gvsig.desktop.compat.cdc / org.gvsig.fmap.dal / org.gvsig.fmap.dal.file / org.gvsig.fmap.dal.file.bsq / src / main / java / org / gvsig / fmap / dal / fileutils / impl / DefaultCPGFile.java @ 43867
History | View | Annotate | Download (7.65 KB)
1 |
package org.gvsig.fmap.dal.fileutils.impl; |
---|---|
2 |
|
3 |
import java.io.File; |
4 |
import java.io.IOException; |
5 |
import org.apache.commons.io.FileUtils; |
6 |
import org.apache.commons.io.FilenameUtils; |
7 |
import org.apache.commons.lang3.StringUtils; |
8 |
import org.slf4j.Logger; |
9 |
import org.slf4j.LoggerFactory; |
10 |
import org.gvsig.fmap.dal.fileutils.CPGFile; |
11 |
|
12 |
public class DefaultCPGFile implements CPGFile { |
13 |
|
14 |
private static final Logger logger = LoggerFactory.getLogger(DefaultPRJFile.class); |
15 |
|
16 |
|
17 |
private File source; |
18 |
private String charsetName = null; |
19 |
|
20 |
/**
|
21 |
* Define the valid code pages (equivalent to MSDOS code pages).
|
22 |
* This codes are used on the byte 29 of the DBF header to define the DBF
|
23 |
* codepage.
|
24 |
*
|
25 |
* The equivalences of these charsets using Java NIO charset names are
|
26 |
* defined on the {@link #charsetNames} array (so 0x01 is
|
27 |
* equivalent to IBM437, 0x02 to IBM850, etc)
|
28 |
*
|
29 |
* See some other equivalences in:
|
30 |
* https://github.com/infused/dbf/blob/master/docs/supported_encodings.csv
|
31 |
* https://github.com/olemb/dbfread/blob/master/dbfread/codepages.py
|
32 |
* https://joinup.ec.europa.eu/svn/gvsig-desktop/trunk/libraries/libFMap/src/com/iver/cit/gvsig/fmap/drivers/dbf/DbfEncodings.java
|
33 |
*/
|
34 |
private static final short[] codePages = { |
35 |
0x01, 0x02, 0x03, 0x04, |
36 |
0x08, 0x09, 0x0a, 0x0b, |
37 |
0x0d, 0x0e, 0x0f, 0x10, |
38 |
0x11, 0x12, 0x13, 0x14, |
39 |
0x15, 0x16, 0x17, 0x18, |
40 |
0x19, 0x1a, 0x1b, 0x1c, |
41 |
0x1d, 0x1f, 0x22, 0x23, |
42 |
0x24, 0x25, 0x26, 0x37, |
43 |
0x40, 0x4d, 0x4e, 0x4f, |
44 |
0x50, 0x57, 0x58, 0x59, |
45 |
0x64, 0x65, 0x66, 0x67, |
46 |
0x68, 0x69, 0x6a, 0x6b, |
47 |
0x6c, 0x78, 0x79, 0x7a, |
48 |
0x7b, 0x7c, 0x7d, 0x7d, |
49 |
0x86, 0x87, 0x88, 0xc8, |
50 |
0xc9, 0xca, 0xcb, 0xcc}; |
51 |
|
52 |
/**
|
53 |
* Equivalent Java charset names to the code pages defined in
|
54 |
* {@link #codePages}, using Java NIO Charset names (which differ
|
55 |
* from JAVA IO names, see
|
56 |
* https://docs.oracle.com/javase/8/docs/technotes/guides/intl/encoding.doc.html)
|
57 |
*/
|
58 |
private static final String[] charsetNames = new String[]{ |
59 |
"IBM437", "IBM850", "windows-1252", "x-MacRoman", |
60 |
"IBM865", "IBM437", "IBM850", "IBM437", |
61 |
"IBM437", "IBM850", "IBM437", "IBM850", |
62 |
"IBM437", "IBM850", "x-IBM943", "IBM850", |
63 |
"IBM437", "IBM850", "IBM865", "IBM437", |
64 |
"IBM437", "IBM850", "IBM437", "IBM863", |
65 |
"IBM850", "IBM852", "IBM852", "IBM852", |
66 |
"IBM860", "IBM850", "IBM866", "IBM850", |
67 |
"IBM852", "x-mswin-936", "x-IBM949", "IBM950", |
68 |
"x-IBM874", "windows-1252", "windows-1252", "windows-1252", |
69 |
"IBM852", "IBM866", "IBM865", "IBM861", |
70 |
// 0x68 and 0x69 are unofficial "Codepage 895 Kamenicky (Czech) MS-DOS" and "Codepage 620 Mazovia (Polish) MS-DOS",
|
71 |
// but there is no Java equivalent
|
72 |
// so we use CP437 which is the closest charset for the latin characters part
|
73 |
"IBM437", "IBM437", "x-IBM737", "IBM857", |
74 |
"IBM863", "x-IBM950", "x-IBM949", "x-mswin-936", |
75 |
"x-IBM942", "x-IBM874", "windows-1255", "windows-1256", |
76 |
"x-IBM737", "IBM852", "IBM857", "windows-1250", |
77 |
"windows-1251", "windows-1254", "windows-1253", "windows-1257"}; |
78 |
|
79 |
/**
 * Creates an empty instance: no backing file and no charset name.
 */
public DefaultCPGFile() {
    this.source = null;
    this.charsetName = null;
}
83 |
|
84 |
/**
 * Creates an instance and immediately tries to load the CPG file that
 * corresponds to the given file (see {@link #read(File)}).
 *
 * @param file file used to locate the CPG file
 */
@SuppressWarnings("OverridableMethodCallInConstructor")
public DefaultCPGFile(File file) {
    this();
    this.read(file);
}
89 |
|
90 |
@Override
|
91 |
public File getFile(File file) { |
92 |
File f = new File(FilenameUtils.removeExtension(file.getAbsolutePath()) + "." + FILE_EXTENSION); |
93 |
return f;
|
94 |
} |
95 |
|
96 |
@Override
|
97 |
public File getFile() { |
98 |
return source;
|
99 |
} |
100 |
|
101 |
@Override
|
102 |
public String getCharsetName() { |
103 |
return this.charsetName; |
104 |
} |
105 |
|
106 |
@Override
|
107 |
public void setCharsetName(String charsetName) { |
108 |
this.charsetName = charsetName;
|
109 |
} |
110 |
|
111 |
@Override
|
112 |
public String toCharsetName(String codePageName) { |
113 |
if (codePageName.equals("UTF8")) { |
114 |
return "UTF-8"; |
115 |
} |
116 |
if (codePageName.equals("SJIS")) { |
117 |
return "Shift_JIS"; |
118 |
} |
119 |
|
120 |
if (StringUtils.isNumeric(codePageName)) {
|
121 |
if (codePageName.startsWith("8859") && codePageName.length() > 4) { |
122 |
return "ISO-8859-" + codePageName.substring(4); |
123 |
} |
124 |
if (codePageName.startsWith("125") && codePageName.length() == 4) { |
125 |
return "windows-" + codePageName; |
126 |
} |
127 |
if (codePageName.length() == 3) { |
128 |
return "IBM-" + codePageName; |
129 |
} |
130 |
for (int i = 0; i < charsetNames.length; i++) { |
131 |
if (charsetNames[i].contains(codePageName)) {
|
132 |
return codePageName;
|
133 |
} |
134 |
} |
135 |
} |
136 |
if (codePageName.equals("65001")) { |
137 |
return "UTF-8"; |
138 |
} |
139 |
return codePageName;
|
140 |
} |
141 |
|
142 |
/**
|
143 |
* Gets the Java NIO charset name equivalent to the provided code page.
|
144 |
* Gets null if the provided code page is not recognised
|
145 |
* as a valid code
|
146 |
*
|
147 |
* @param codePage
|
148 |
* @return
|
149 |
*/
|
150 |
@Override
|
151 |
public String toCharsetName(int codePage) { |
152 |
if (codePage != 0) { |
153 |
for (int i = 0; i < codePages.length; i++) { |
154 |
if (codePages[i] == codePage) {
|
155 |
return charsetNames[i];
|
156 |
} |
157 |
} |
158 |
} |
159 |
return null; |
160 |
} |
161 |
|
162 |
@Override
|
163 |
public String toCPGName(String charsetName) { |
164 |
if (charsetName.startsWith("windows-") |
165 |
|| charsetName.startsWith("ISO-8859")
|
166 |
|| charsetName.startsWith("IBM-")
|
167 |
|| charsetName.startsWith("x-IBM")
|
168 |
|| charsetName.startsWith("x-mswin-")) {
|
169 |
return charsetName.replaceAll("[^\\d]", ""); |
170 |
} |
171 |
if (charsetName.equals("Shift_JIS")) { |
172 |
return "SJIS"; |
173 |
} |
174 |
// For the rest of the charsets, we'll directly write the Java NIO Charset
|
175 |
// Probably they will only be recognized by gvSIG, but it's better than nothing
|
176 |
return charsetName;
|
177 |
} |
178 |
|
179 |
/**
|
180 |
* Returns the code page corresponding to the
|
181 |
* provided charset name
|
182 |
*
|
183 |
* @param charsetName
|
184 |
* @return The code page, or 0x00 if no equivalent code page was found for
|
185 |
* the provided charsetName
|
186 |
*/
|
187 |
@Override
|
188 |
public int toCPG(String charsetName) { |
189 |
for (int i = 0; i < charsetNames.length; i++) { |
190 |
if (charsetNames[i].equals(charsetName)) {
|
191 |
return codePages[i];
|
192 |
} |
193 |
} |
194 |
// default
|
195 |
return 0x00; |
196 |
} |
197 |
|
198 |
@Override
|
199 |
public void read(File file) { |
200 |
File f = this.getFile(file); |
201 |
if (f.exists()) {
|
202 |
try {
|
203 |
String theContents = FileUtils.readFileToString(f);
|
204 |
theContents = StringUtils.trim(theContents); |
205 |
if (StringUtils.isNotEmpty(theContents)) {
|
206 |
String theCharset = toCharsetName(theContents);
|
207 |
this.charsetName = theCharset;
|
208 |
this.source = f.getAbsoluteFile();
|
209 |
} |
210 |
} catch (IOException e) { |
211 |
logger.warn("Couldn't read " + FILE_EXTENSION + " file (" + f.getAbsolutePath() + ").", e); |
212 |
} |
213 |
} |
214 |
} |
215 |
|
216 |
@Override
|
217 |
public void write(File file) throws IOException { |
218 |
File f = this.getFile(file); |
219 |
try {
|
220 |
String export = toCPGName(this.charsetName) + "\n"; |
221 |
FileUtils.writeStringToFile(f, export, "ISO-8859-1"); |
222 |
this.source = f;
|
223 |
} catch (Exception e) { |
224 |
logger.warn("Couldn't write " + FILE_EXTENSION + " file (" + f.getAbsolutePath() + ").", e); |
225 |
throw e;
|
226 |
} |
227 |
} |
228 |
|
229 |
} |