Statistics
| Revision:

svn-gvsig-desktop / branches / org.gvsig.desktop-2018a / org.gvsig.desktop.compat.cdc / org.gvsig.fmap.dal / org.gvsig.fmap.dal.file / org.gvsig.fmap.dal.file.bsq / src / main / java / org / gvsig / fmap / dal / fileutils / impl / DefaultCPGFile.java @ 43867

History | View | Annotate | Download (7.65 KB)

1
package org.gvsig.fmap.dal.fileutils.impl;
2

    
3
import java.io.File;
4
import java.io.IOException;
5
import org.apache.commons.io.FileUtils;
6
import org.apache.commons.io.FilenameUtils;
7
import org.apache.commons.lang3.StringUtils;
8
import org.slf4j.Logger;
9
import org.slf4j.LoggerFactory;
10
import org.gvsig.fmap.dal.fileutils.CPGFile;
11

    
12
public class DefaultCPGFile implements CPGFile {
13

    
14
    private static final Logger logger = LoggerFactory.getLogger(DefaultPRJFile.class);
15

    
16

    
17
    private File source;
18
    private String charsetName = null;
19

    
20
    /**
21
     * Define the valid code pages (equivalent to MSDOS code pages). 
22
     * This codes are used on the byte 29 of the DBF header to define the DBF
23
     * codepage.
24
     *
25
     * The equivalences of these charsets using Java NIO charset names are
26
     * defined on the {@link #charsetNames} array (so 0x01 is
27
     * equivalent to IBM437, 0x02 to IBM850, etc)
28
     *
29
     * See some other equivalences in:
30
     * https://github.com/infused/dbf/blob/master/docs/supported_encodings.csv
31
     * https://github.com/olemb/dbfread/blob/master/dbfread/codepages.py
32
     * https://joinup.ec.europa.eu/svn/gvsig-desktop/trunk/libraries/libFMap/src/com/iver/cit/gvsig/fmap/drivers/dbf/DbfEncodings.java
33
     */
34
    private static final short[] codePages = {
35
        0x01, 0x02, 0x03, 0x04,
36
        0x08, 0x09, 0x0a, 0x0b,
37
        0x0d, 0x0e, 0x0f, 0x10,
38
        0x11, 0x12, 0x13, 0x14,
39
        0x15, 0x16, 0x17, 0x18,
40
        0x19, 0x1a, 0x1b, 0x1c,
41
        0x1d, 0x1f, 0x22, 0x23,
42
        0x24, 0x25, 0x26, 0x37,
43
        0x40, 0x4d, 0x4e, 0x4f,
44
        0x50, 0x57, 0x58, 0x59,
45
        0x64, 0x65, 0x66, 0x67,
46
        0x68, 0x69, 0x6a, 0x6b,
47
        0x6c, 0x78, 0x79, 0x7a,
48
        0x7b, 0x7c, 0x7d, 0x7d,
49
        0x86, 0x87, 0x88, 0xc8,
50
        0xc9, 0xca, 0xcb, 0xcc};
51

    
52
    /**
53
     * Equivalent Java charset names to the code pages defined in
54
     * {@link #codePages}, using Java NIO Charset names (which differ
55
     * from JAVA IO names, see
56
     * https://docs.oracle.com/javase/8/docs/technotes/guides/intl/encoding.doc.html)
57
     */
58
    private static final String[] charsetNames = new String[]{
59
        "IBM437", "IBM850", "windows-1252", "x-MacRoman",
60
        "IBM865", "IBM437", "IBM850", "IBM437",
61
        "IBM437", "IBM850", "IBM437", "IBM850",
62
        "IBM437", "IBM850", "x-IBM943", "IBM850",
63
        "IBM437", "IBM850", "IBM865", "IBM437",
64
        "IBM437", "IBM850", "IBM437", "IBM863",
65
        "IBM850", "IBM852", "IBM852", "IBM852",
66
        "IBM860", "IBM850", "IBM866", "IBM850",
67
        "IBM852", "x-mswin-936", "x-IBM949", "IBM950",
68
        "x-IBM874", "windows-1252", "windows-1252", "windows-1252",
69
        "IBM852", "IBM866", "IBM865", "IBM861",
70
        // 0x68 and 0x69 are unofficial "Codepage 895 Kamenicky (Czech) MS-DOS" and "Codepage 620  Mazovia (Polish) MS-DOS",
71
        // but there is no Java equivalent
72
        // so we use CP437 which is the closest charset for the latin characters part
73
        "IBM437", "IBM437", "x-IBM737", "IBM857",
74
        "IBM863", "x-IBM950", "x-IBM949", "x-mswin-936",
75
        "x-IBM942", "x-IBM874", "windows-1255", "windows-1256",
76
        "x-IBM737", "IBM852", "IBM857", "windows-1250",
77
        "windows-1251", "windows-1254", "windows-1253", "windows-1257"};
78

    
79
    public DefaultCPGFile() {
80
        this.charsetName = null;
81
        this.source = null;
82
    }
83

    
84
    @SuppressWarnings("OverridableMethodCallInConstructor")
85
    public DefaultCPGFile(File file) {
86
        this();
87
        read(file);
88
    }
89

    
90
    @Override
91
    public File getFile(File file) {
92
        File f = new File(FilenameUtils.removeExtension(file.getAbsolutePath()) + "." + FILE_EXTENSION);
93
        return f;
94
    }
95

    
96
    @Override
97
    public File getFile() {
98
        return source;
99
    }
100

    
101
    @Override
102
    public String getCharsetName() {
103
        return this.charsetName;
104
    }
105

    
106
    @Override
107
    public void setCharsetName(String charsetName) {
108
        this.charsetName = charsetName;
109
    }
110

    
111
    @Override
112
    public String toCharsetName(String codePageName) {
113
        if (codePageName.equals("UTF8")) {
114
            return "UTF-8";
115
        }
116
        if (codePageName.equals("SJIS")) {
117
            return "Shift_JIS";
118
        }
119

    
120
        if (StringUtils.isNumeric(codePageName)) {
121
            if (codePageName.startsWith("8859") && codePageName.length() > 4) {
122
                return "ISO-8859-" + codePageName.substring(4);
123
            }
124
            if (codePageName.startsWith("125") && codePageName.length() == 4) {
125
                return "windows-" + codePageName;
126
            }
127
            if (codePageName.length() == 3) {
128
                return "IBM-" + codePageName;
129
            }
130
            for (int i = 0; i < charsetNames.length; i++) {
131
                if (charsetNames[i].contains(codePageName)) {
132
                    return codePageName;
133
                }
134
            }
135
        }
136
        if (codePageName.equals("65001")) {
137
            return "UTF-8";
138
        }
139
        return codePageName;
140
    }
141

    
142
    /**
143
     * Gets the Java NIO charset name equivalent to the provided code page.
144
     * Gets null if the provided code page is not recognised
145
     * as a valid code
146
     *
147
     * @param codePage
148
     * @return
149
     */
150
    @Override
151
    public String toCharsetName(int codePage) {
152
        if (codePage != 0) {
153
            for (int i = 0; i < codePages.length; i++) {
154
                if (codePages[i] == codePage) {
155
                    return charsetNames[i];
156
                }
157
            }
158
        }
159
        return null;
160
    }
161

    
162
    @Override
163
    public String toCPGName(String charsetName) {
164
        if (charsetName.startsWith("windows-")
165
                || charsetName.startsWith("ISO-8859")
166
                || charsetName.startsWith("IBM-")
167
                || charsetName.startsWith("x-IBM")
168
                || charsetName.startsWith("x-mswin-")) {
169
            return charsetName.replaceAll("[^\\d]", "");
170
        }
171
        if (charsetName.equals("Shift_JIS")) {
172
            return "SJIS";
173
        }
174
        // For the rest of the charsets, we'll directly write the Java NIO Charset
175
        // Probably they will only be recognized by gvSIG, but it's better than nothing
176
        return charsetName;
177
    }
178

    
179
    /**
180
     * Returns the code page corresponding to the
181
     * provided charset name
182
     *
183
     * @param charsetName
184
     * @return The code page, or 0x00 if no equivalent code page was found for
185
     * the provided charsetName
186
     */
187
    @Override
188
    public int toCPG(String charsetName) {
189
        for (int i = 0; i < charsetNames.length; i++) {
190
            if (charsetNames[i].equals(charsetName)) {
191
                return codePages[i];
192
            }
193
        }
194
        // default
195
        return 0x00;
196
    }
197

    
198
    @Override
199
    public void read(File file) {
200
        File f = this.getFile(file);
201
        if (f.exists()) {
202
            try {
203
                String theContents = FileUtils.readFileToString(f);
204
                theContents = StringUtils.trim(theContents);
205
                if (StringUtils.isNotEmpty(theContents)) {
206
                    String theCharset = toCharsetName(theContents);
207
                    this.charsetName = theCharset;
208
                    this.source = f.getAbsoluteFile();
209
                }
210
            } catch (IOException e) {
211
                logger.warn("Couldn't read " + FILE_EXTENSION + " file (" + f.getAbsolutePath() + ").", e);
212
            }
213
        }
214
    }
215

    
216
    @Override
217
    public void write(File file) throws IOException {
218
        File f = this.getFile(file);
219
        try {
220
            String export = toCPGName(this.charsetName) + "\n";
221
            FileUtils.writeStringToFile(f, export, "ISO-8859-1");
222
            this.source = f;
223
        } catch (Exception e) {
224
            logger.warn("Couldn't write " + FILE_EXTENSION + " file (" + f.getAbsolutePath() + ").", e);
225
            throw e;
226
        }
227
    }
228

    
229
}