Commit 3963d544 authored by R.W.Majeed

use specified text encoding for reading csv files

parent 16f06ff0
......@@ -6,6 +6,7 @@ import java.io.InputStreamReader;
import java.io.UncheckedIOException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.time.Instant;
import com.opencsv.CSVParser;
......@@ -19,7 +20,7 @@ public class FileRowSupplier extends RowSupplier {
private Instant timestamp;
public FileRowSupplier(URL location, String fieldSeparator) throws IOException{
public FileRowSupplier(URL location, String fieldSeparator, Charset charset) throws IOException{
if( fieldSeparator.length() > 1 ){
if( fieldSeparator.equals("\\t") ){
fieldSeparator = "\t";
......@@ -28,7 +29,7 @@ public class FileRowSupplier extends RowSupplier {
}
}
this.url = location;
this.in = new CSVReader(new InputStreamReader(location.openStream()),fieldSeparator.charAt(0), CSVParser.DEFAULT_QUOTE_CHARACTER, (char)0);
this.in = new CSVReader(new InputStreamReader(location.openStream(), charset),fieldSeparator.charAt(0), CSVParser.DEFAULT_QUOTE_CHARACTER, (char)0);
// TODO: check whether the underlying InputStream needs to be closed explicitly
......
......@@ -3,6 +3,7 @@ package de.sekmi.histream.etl.config;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.regex.Pattern;
import javax.xml.bind.annotation.XmlAccessType;
......@@ -32,7 +33,7 @@ public class CsvFile extends TableSource{
String url;
/**
* File encoding is not used yet.
* Encoding to use for reading text files
*/
@XmlElement
String encoding;
......@@ -44,11 +45,11 @@ public class CsvFile extends TableSource{
@XmlElement
String separator;
@XmlElement
String quote;
@XmlElement
char escape;
// @XmlElement
// String quote;
//
// @XmlElement
// char escape;
private CsvFile(){
}
......@@ -59,9 +60,18 @@ public class CsvFile extends TableSource{
}
/**
 * Opens the configured CSV source and returns a supplier for its rows.
 * <p>
 * The {@code url} field is resolved against {@code meta.getLocation()} when
 * that base is non-null; otherwise {@code url} is parsed on its own. The
 * charset is taken from the {@code encoding} field when it is set and falls
 * back to {@link Charset#defaultCharset()} when it is not.
 *
 * @param meta metadata whose location serves as base URL for resolving {@code url}
 * @return a {@link FileRowSupplier} reading from the resolved URL
 * @throws IOException if the URL cannot be constructed (a MalformedURLException
 *  is an IOException) or the FileRowSupplier constructor fails
 */
@Override
public RowSupplier rows(Meta meta) throws IOException {
// resolve url relative to base url from metadata
URL base = meta.getLocation();
URL source = (base == null)?new URL(url):new URL(base, url);
// NOTE(review): this two-argument return looks like the pre-change line of the
// diff; the charset-aware return at the end of the method presumably replaces
// it -- confirm against the repository.
return new FileRowSupplier(source, separator);
// determine charset
Charset charset;
if( encoding != null ) {
// Charset.forName throws an unchecked exception for an illegal or unsupported name
charset = Charset.forName(encoding);
}else{
// if not defined, use system charset
charset = Charset.defaultCharset();
}
return new FileRowSupplier(source, separator, charset);
}
}
package de.sekmi.histream.etl;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.junit.Assert;
import org.junit.Test;
......@@ -9,7 +10,7 @@ public class TestRowSupplier {
@Test
public void testLoadRows() throws IOException{
try( FileRowSupplier r = new FileRowSupplier(getClass().getResource("/data/test-1-patients.txt"), "\t") ){
try( FileRowSupplier r = new FileRowSupplier(getClass().getResource("/data/test-1-patients.txt"), "\t", StandardCharsets.ISO_8859_1) ){
String[] h = r.getHeaders();
Assert.assertEquals("patid", h[0]);
Assert.assertEquals("nachname", h[2]);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment