From 3963d544e7756ba47c40ce64b0ad9ee9e0d8e20e Mon Sep 17 00:00:00 2001 From: "R.W.Majeed" Date: Thu, 21 Jun 2018 15:44:13 +0200 Subject: [PATCH] use specified text encoding for reading csv files --- .../sekmi/histream/etl/FileRowSupplier.java | 5 ++-- .../de/sekmi/histream/etl/config/CsvFile.java | 24 +++++++++++++------ .../sekmi/histream/etl/TestRowSupplier.java | 3 ++- 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/histream-import/src/main/java/de/sekmi/histream/etl/FileRowSupplier.java b/histream-import/src/main/java/de/sekmi/histream/etl/FileRowSupplier.java index ce5a5fe..cd026e2 100644 --- a/histream-import/src/main/java/de/sekmi/histream/etl/FileRowSupplier.java +++ b/histream-import/src/main/java/de/sekmi/histream/etl/FileRowSupplier.java @@ -6,6 +6,7 @@ import java.io.InputStreamReader; import java.io.UncheckedIOException; import java.net.URL; import java.net.URLConnection; +import java.nio.charset.Charset; import java.time.Instant; import com.opencsv.CSVParser; @@ -19,7 +20,7 @@ public class FileRowSupplier extends RowSupplier { private Instant timestamp; - public FileRowSupplier(URL location, String fieldSeparator) throws IOException{ + public FileRowSupplier(URL location, String fieldSeparator, Charset charset) throws IOException{ if( fieldSeparator.length() > 1 ){ if( fieldSeparator.equals("\\t") ){ fieldSeparator = "\t"; @@ -28,7 +29,7 @@ public class FileRowSupplier extends RowSupplier { } } this.url = location; - this.in = new CSVReader(new InputStreamReader(location.openStream()),fieldSeparator.charAt(0), CSVParser.DEFAULT_QUOTE_CHARACTER, (char)0); + this.in = new CSVReader(new InputStreamReader(location.openStream(), charset),fieldSeparator.charAt(0), CSVParser.DEFAULT_QUOTE_CHARACTER, (char)0); // TODO: check whether needed to close underlying InputStream diff --git a/histream-import/src/main/java/de/sekmi/histream/etl/config/CsvFile.java b/histream-import/src/main/java/de/sekmi/histream/etl/config/CsvFile.java index 92d5b92..d793cf4 100644 --- a/histream-import/src/main/java/de/sekmi/histream/etl/config/CsvFile.java +++ b/histream-import/src/main/java/de/sekmi/histream/etl/config/CsvFile.java @@ -3,6 +3,7 @@ package de.sekmi.histream.etl.config; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; +import java.nio.charset.Charset; import java.util.regex.Pattern; import javax.xml.bind.annotation.XmlAccessType; @@ -32,7 +33,7 @@ public class CsvFile extends TableSource{ String url; /** - * File encoding is not used yet. + * Encoding to use for reading text files */ @XmlElement String encoding; @@ -44,11 +45,11 @@ public class CsvFile extends TableSource{ @XmlElement String separator; - @XmlElement - String quote; - - @XmlElement - char escape; +// @XmlElement +// String quote; +// +// @XmlElement +// char escape; private CsvFile(){ } @@ -59,9 +60,18 @@ public class CsvFile extends TableSource{ } @Override public RowSupplier rows(Meta meta) throws IOException { + // resolve url relative to base url from metadata URL base = meta.getLocation(); URL source = (base == null)?new URL(url):new URL(base, url); - return new FileRowSupplier(source, separator); + // determine charset + Charset charset; + if( encoding != null ) { + charset = Charset.forName(encoding); + }else{ + // if not defined, use system charset + charset = Charset.defaultCharset(); + } + return new FileRowSupplier(source, separator, charset); } } diff --git a/histream-import/src/test/java/de/sekmi/histream/etl/TestRowSupplier.java b/histream-import/src/test/java/de/sekmi/histream/etl/TestRowSupplier.java index 9b7c93c..965864c 100644 --- a/histream-import/src/test/java/de/sekmi/histream/etl/TestRowSupplier.java +++ b/histream-import/src/test/java/de/sekmi/histream/etl/TestRowSupplier.java @@ -1,6 +1,7 @@ package de.sekmi.histream.etl; import java.io.IOException; +import java.nio.charset.StandardCharsets; import org.junit.Assert; import org.junit.Test; @@ -9,7 +10,7 @@ public class TestRowSupplier { @Test public void testLoadRows() throws IOException{ - try( FileRowSupplier r = new FileRowSupplier(getClass().getResource("/data/test-1-patients.txt"), "\t") ){ + try( FileRowSupplier r = new FileRowSupplier(getClass().getResource("/data/test-1-patients.txt"), "\t", StandardCharsets.ISO_8859_1) ){ String[] h = r.getHeaders(); Assert.assertEquals("patid", h[0]); Assert.assertEquals("nachname", h[2]); -- GitLab