...
 
Commits (6)
......@@ -386,7 +386,12 @@ public class DateTimeAccuracy implements Comparable<DateTimeAccuracy> {
}
/**
 * Orders timestamps primarily by their instant. When both instants are
 * equal, the accuracy decides the order, so that two timestamps with the
 * same instant but different accuracy do not compare as equal.
 * The intent is that the more accurate timestamp comes first.
 *
 * @param o timestamp to compare with
 * @return negative, zero or positive if this timestamp sorts before,
 *  equal to or after {@code o}
 */
@Override
public int compareTo(DateTimeAccuracy o) {
    int cmp = instant.compareTo(o.instant);
    // if instants are equal, order by accuracy. more accurate comes first
    if( cmp == 0 ){
        cmp = accuracy.compareTo(o.accuracy);
    }
    return cmp;
}
@Override
......
......@@ -164,5 +164,19 @@ public class TestDateTimeAccuracy {
// date should be treated as if it had a +08:00 offset
assertEquals("2001-02-02T20Z", a.toPartialIso8601(ZoneId.of("UTC")));
}
/**
 * Timestamps which start at the same instant but were parsed with
 * different accuracy must be unequal and must have a defined order.
 */
@Test
public void verifyComparison() throws ParseException{
    ZoneId utc = ZoneOffset.UTC.normalized();
    DateTimeAccuracy hourPrecision = DateTimeAccuracy.parsePartialIso8601("2001-02-03T04", utc);
    DateTimeAccuracy minutePrecision = DateTimeAccuracy.parsePartialIso8601("2001-02-03T04:00", utc);

    // both values begin at the very same instant ...
    Assert.assertEquals(hourPrecision.toInstantMin(), minutePrecision.toInstantMin());
    // ... but they are still distinct values
    Assert.assertNotEquals(hourPrecision, minutePrecision);

    // the hour-precision value sorts after the minute-precision value, in both directions
    Assert.assertTrue(hourPrecision.compareTo(minutePrecision) > 0);
    Assert.assertTrue(minutePrecision.compareTo(hourPrecision) < 0);
}
// TODO: further tests
}
package de.sekmi.histream.etl.filter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import javax.xml.bind.annotation.XmlElement;
import javax.xml.bind.annotation.XmlType;
import de.sekmi.histream.DateTimeAccuracy;
import de.sekmi.histream.scripting.AbstractFacts;
import de.sekmi.histream.scripting.Fact;
@XmlType(name="duplicate-fact")
public class DuplicateFactFilter extends PostProcessingFilter{
......@@ -11,10 +18,55 @@ public class DuplicateFactFilter extends PostProcessingFilter{
// Concepts configured for this filter, bound as XML element(s) via JAXB.
// NOTE(review): processVisit does not evaluate this list yet (restricting the
// filter to these concepts is still marked as TODO there); it may be null
// when the configuration specifies no concept -- confirm before using it.
@XmlElement
public String[] concept;
/**
 * Orders facts by the start time of their observation. Facts with equal
 * start time are ordered by their concept.
 */
private static class FactComparator implements Comparator<Fact>{
    @Override
    public int compare(Fact o1, Fact o2) {
        DateTimeAccuracy firstStart = o1.getObservation().getStartTime();
        DateTimeAccuracy secondStart = o2.getObservation().getStartTime();
        int byStart = firstStart.compareTo(secondStart);
        if( byStart != 0 ){
            // start times differ, no need to look at the concepts
            return byStart;
        }
        // same start time: the concept decides
        return o1.getConcept().compareTo(o2.getConcept());
    }
}
/**
 * Removes every fact which has the same start time and the same concept
 * as the fact directly preceding it in sorted order. The facts are sorted
 * by start time and concept first and are left in that order.
 *
 * @param facts facts to sort and to remove the duplicates from
 */
private void removeAllDuplicates(AbstractFacts facts){
    // bring candidates for duplicates next to each other
    facts.sort( new FactComparator() );

    // Walk from the end towards the front: removing the entry at position
    // i leaves all entries before i (which are examined later) untouched.
    for( int i=facts.size()-1; i>0; i-- ){
        Fact previous = facts.get(i-1);
        Fact current = facts.get(i);
        DateTimeAccuracy previousStart = previous.getObservation().getStartTime();
        if( previousStart == null ){
            // nothing to compare against
            continue;
        }
        boolean sameStart = previousStart.equals(current.getObservation().getStartTime());
        if( sameStart && previous.getConcept().equals(current.getConcept()) ){
            // found duplicate
            facts.removeIndex(i);
        }
    }
}
/**
 * Removes duplicate facts from the supplied facts by delegating to
 * removeAllDuplicates. The configured concept list is not evaluated yet,
 * so duplicates are removed for all concepts.
 *
 * @param facts facts to process; sorted and modified in place
 */
@Override
public void processVisit(AbstractFacts facts) {
// Sketch for the planned restriction to the configured concepts:
// create set for O(1) lookup
// HashSet<String> match = new HashSet<>(concept.length);
// Collections.addAll(match, concept);
// TODO implement for limited concepts
removeAllDuplicates(facts);
}
}
......@@ -5,6 +5,7 @@ import org.junit.Test;
import de.sekmi.histream.ObservationSupplier;
import de.sekmi.histream.etl.ETLObservationSupplier;
import de.sekmi.histream.etl.config.DataSource;
import de.sekmi.histream.io.Streams;
public class TestValidator {
......@@ -69,4 +70,14 @@ public class TestValidator {
}
Assert.fail("Exception expected");
}
/**
 * Loads the datasource which contains duplicate concepts but has the
 * duplicate-fact filter configured, and runs it through the validator.
 * Any validation error is rethrown and fails the test.
 */
@Test
public void validateData4WithDuplicateFilter() throws Exception{
    // the source data contains duplicate concepts
    try( ObservationSupplier supplier = ETLObservationSupplier.load(getClass().getResource("/data/test-4-datasource2.xml")) ){
        Validator validator = new Validator(true,true);
        validator.setErrorHandler(error -> {
            throw new RuntimeException(error);
        });
        Streams.transfer(supplier, validator);
    }
    // reaching this point means no duplicate concept exception occurred
}
}
<?xml version="1.0" encoding="UTF-8"?>
<datasource version="1.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<!-- Duplicate concept for patient -->
<meta>
<id>LTx_PH_ILD_COPD</id>
<etl-strategy>replace-source</etl-strategy>
</meta>
<patient-table>
<source xsi:type="csv-file">
<url>test-4-table.csv</url>
<separator>;</separator>
</source>
<idat>
<!-- Given name, surname and gender are unknown -->
<patient-id column="Pheno-ID"/>
<given-name column="Pheno-ID"/>
<surname column="Pheno-ID"/>
</idat>
<ignore xsi:type="string" column="*"/>
</patient-table>
<visit-table>
<source xsi:type="csv-file">
<url>test-4-table.csv</url>
<separator>;</separator>
</source>
<idat>
<patient-id column="Pheno-ID"/>
<visit-id column="Zeitpunkt"/>
<start column="Zeitpunkt" format="d.M.u[ H[:m[:s]]]"/>
</idat>
<ignore xsi:type="string" column="*"/>
</visit-table>
<eav-table>
<source xsi:type="csv-file">
<url>test-4-table.csv</url>
<separator>;</separator>
</source>
<idat>
<patient-id column="Pheno-ID"/>
<visit-id column="Zeitpunkt"/>
</idat>
<mdat>
<concept column="Export-Param"/>
<start column="Zeitpunkt" format="d.M.u[ H[:m[:s]]]"/>
<end column="Zeitpunkt" format="d.M.u[ H[:m[:s]]]"/>
<type constant-value="string"/>
<value column="Wert" na=""/>
<unit column="Einheiten" na=""/>
</mdat>
<virtual>
<value column="Diagnose" xsi:type="string" na="">
<map>
<otherwise log-warning="Unexpected value" action="drop-fact" />
</map>
</value>
<value column="Zusatzdiagnose" xsi:type="string" na="">
<map>
<case value="IPF" set-concept="B:DP-ID-IPF" set-value=""/>
<case value="UIP" set-concept="B:DP-ID-IPF" set-value=""/>
<otherwise log-warning="Unexpected value" action="drop-fact" />
</map>
</value>
<value column="Probenart" xsi:type="string" na="">
<map>
<otherwise action="drop-fact"/>
</map>
</value>
<value column="Diesease Area" xsi:type="string" na="">
<map>
<otherwise action="drop-fact"/>
</map>
</value>
</virtual>
<ignore xsi:type="string" column="*"/>
</eav-table>
<post-processing>
<filter xsi:type="duplicate-fact"/>
</post-processing>
</datasource>
Pheno-ID;Bereich;Zeitpunkt;Export-Param;Wert;Einheiten;
Mmqp212;Biobank-IDs;21.04.2016;Zusatzdiagnose;UIP;;
Mmqp212;Biobank-IDs;21.04.2016;Zusatzdiagnose;IPF;;
\ No newline at end of file
Mmqp212;Biobank-IDs;21.04.2016;Zusatzdiagnose;IPF;;
Mmqp212;Biobank-IDs;21.04.2016;B;IPF;;
Mmqp212;Biobank-IDs;21.04.2016;C;IPF;;
Mmqp212;Biobank-IDs;21.04.2016;B;IPF;;
package de.sekmi.histream.scripting;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import de.sekmi.histream.Observation;
......@@ -51,13 +53,18 @@ public abstract class AbstractFacts {
if( i == -1 ){
return null;
}else{
Fact f = facts.remove(i);
Observation o = sourceList.remove(i);
// verify that fact and observation are associated
assert f.getObservation() == o;
return f;
return removeIndex(i);
}
}
/**
 * Removes the fact at the given position together with the entry at the
 * same position of the source observation list.
 *
 * @param index position of the fact to remove
 * @return the removed fact
 */
public Fact removeIndex(int index){
    final Fact removedFact = facts.remove(index);
    final Observation removedSource = sourceList.remove(index);
    // both lists are expected to be aligned: the removed fact must wrap
    // exactly the removed observation
    assert removedFact.getObservation() == removedSource;
    return removedFact;
}
/**
 * Returns the fact stored at the given position.
 *
 * @param index position of the fact
 * @return fact at that position
 */
public Fact get(int index){
return facts.get(index);
}
......@@ -81,4 +88,37 @@ public abstract class AbstractFacts {
return f;
}
/**
 * Sorts the facts with the supplied comparator and reorders the source
 * observation list in exactly the same way, so that the fact and the
 * observation at any index stay associated.
 * <p>
 * The order is first determined on an array of indices, then both lists
 * are rewritten in that order. Facts which compare as equal keep their
 * relative order because {@link Arrays#sort(Object[], Comparator)} is stable.
 *
 * @param comparator defines the order of the facts
 */
public void sort(Comparator<Fact> comparator){
    final int count = facts.size();
    Integer[] order = new Integer[count];
    for( int i=0; i<count; i++ ){
        order[i] = i;
    }
    // determine sort order: afterwards order[k] is the current index of
    // the fact which belongs at position k
    Arrays.sort(order, (a, b) -> comparator.compare(facts.get(a), facts.get(b)));

    // Collect both lists in target order before writing anything back.
    // order[] maps target position -> source index; moving elements
    // in place as if it mapped source -> target would apply the inverse
    // permutation and scramble any cycle of three or more elements.
    List<Fact> sortedFacts = new ArrayList<>(count);
    List<Observation> sortedSources = new ArrayList<>(count);
    for( int k=0; k<count; k++ ){
        sortedFacts.add(facts.get(order[k]));
        sortedSources.add(sourceList.get(order[k]));
    }
    // write back in place, so the existing list instances are retained
    for( int k=0; k<count; k++ ){
        facts.set(k, sortedFacts.get(k));
        sourceList.set(k, sortedSources.get(k));
    }
}
}