Skip to content
GitLab
Explore
Sign in
Commits on Source (6)
ordering by most accurate first, if min instants are equal
· cfca417d
R.W.Majeed
authored
Feb 16, 2018
cfca417d
added sort function and removeIndex
· 2fd1ed3e
R.W.Majeed
authored
Feb 16, 2018
2fd1ed3e
implementation of duplicate filter
· d284a522
R.W.Majeed
authored
Feb 16, 2018
d284a522
fixed sort algorithm
· b573713d
R.W.Majeed
authored
Feb 16, 2018
b573713d
test case added for duplicate-filter
· e065815b
R.W.Majeed
authored
Feb 16, 2018
e065815b
more duplicate data
· 1b7d7b48
R.W.Majeed
authored
Feb 17, 2018
1b7d7b48
Hide whitespace changes
Inline
Side-by-side
histream-core/src/main/java/de/sekmi/histream/DateTimeAccuracy.java
View file @
1b7d7b48
...
...
@@ -386,7 +386,12 @@ public class DateTimeAccuracy implements Comparable<DateTimeAccuracy> {
}
@Override
public int compareTo(DateTimeAccuracy o) {
	// primary order: by minimum instant
	// NOTE(review): the stale pre-change `return instant.compareTo(o.instant);`
	// was left above the new code in the previous revision, making the
	// remainder unreachable (a compile error in Java); it is removed here.
	int cmp = instant.compareTo(o.instant);
	// if instants are equal, order by accuracy. more accurate comes first
	if( cmp == 0 ){
		cmp = accuracy.compareTo(o.accuracy);
	}
	return cmp;
}
@Override
...
...
histream-core/src/test/java/de/sekmi/histream/TestDateTimeAccuracy.java
View file @
1b7d7b48
...
...
@@ -164,5 +164,19 @@ public class TestDateTimeAccuracy {
// date should be treated as if it had a +08:00 offset
assertEquals
(
"2001-02-02T20Z"
,
a
.
toPartialIso8601
(
ZoneId
.
of
(
"UTC"
)));
}
@Test
public void verifyComparison() throws ParseException {
	ZoneId zone = ZoneOffset.UTC.normalized();
	// same point in time, parsed with different accuracy (hours vs. minutes)
	DateTimeAccuracy a = DateTimeAccuracy.parsePartialIso8601("2001-02-03T04", zone);
	DateTimeAccuracy b = DateTimeAccuracy.parsePartialIso8601("2001-02-03T04:00", zone);
	// the minimum instants agree ...
	Assert.assertEquals(a.toInstantMin(), b.toInstantMin());
	// ... but the values themselves are not equal
	Assert.assertNotEquals(a, b);
	// comparison still yields a well-defined order: the more accurate value comes first
	Assert.assertTrue(a.compareTo(b) > 0);
	Assert.assertTrue(b.compareTo(a) < 0);
}
// TODO: further tests
}
histream-import/src/main/java/de/sekmi/histream/etl/filter/DuplicateFactFilter.java
View file @
1b7d7b48
package
de.sekmi.histream.etl.filter
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.Comparator
;
import
java.util.HashSet
;
import
javax.xml.bind.annotation.XmlElement
;
import
javax.xml.bind.annotation.XmlType
;
import
de.sekmi.histream.DateTimeAccuracy
;
import
de.sekmi.histream.scripting.AbstractFacts
;
import
de.sekmi.histream.scripting.Fact
;
@XmlType
(
name
=
"duplicate-fact"
)
public
class
DuplicateFactFilter
extends
PostProcessingFilter
{
...
...
@@ -11,10 +18,55 @@ public class DuplicateFactFilter extends PostProcessingFilter{
// Concepts to restrict the duplicate filter to.
// NOTE(review): not evaluated yet — processVisit currently filters ALL concepts; confirm intended use.
@XmlElement
public String[] concept;
/**
 * Orders facts by their observation's start time; facts with equal
 * start times are ordered by concept id. This groups potential
 * duplicates next to each other.
 */
private static class FactComparator implements Comparator<Fact> {
	@Override
	public int compare(Fact o1, Fact o2) {
		// primary key: observation start time
		final int byStart = o1.getObservation().getStartTime()
				.compareTo(o2.getObservation().getStartTime());
		if( byStart != 0 ){
			return byStart;
		}
		// secondary key: concept id (times are equal)
		return o1.getConcept().compareTo(o2.getConcept());
	}
}
/**
 * Removes every fact whose (start time, concept) pair repeats a previous fact.
 * Note: as a side effect, the fact list is re-sorted by start time and concept.
 *
 * @param facts facts of a single visit; modified in place
 */
private void removeAllDuplicates(AbstractFacts facts){
	// group duplicates next to each other: order by start time, then concept
	facts.sort(new FactComparator());
	// collect indices of facts equal to their immediate predecessor
	ArrayList<Integer> dupIndices = new ArrayList<>();
	DateTimeAccuracy prevStart = null;
	String prevConcept = null;
	for( int i = 0; i < facts.size(); i++ ){
		Fact f = facts.get(i);
		DateTimeAccuracy curStart = f.getObservation().getStartTime();
		String curConcept = f.getConcept();
		// the very first fact has no predecessor and can never be a duplicate
		if( prevStart != null
				&& prevStart.equals(curStart)
				&& prevConcept.equals(curConcept) ){
			dupIndices.add(i);
		}
		// remember previous start/concept for the next iteration
		prevStart = curStart;
		prevConcept = curConcept;
	}
	// remove from highest index to lowest so earlier indices stay valid
	for( int k = dupIndices.size() - 1; k >= 0; k-- ){
		facts.removeIndex(dupIndices.get(k));
	}
}
@Override
public void processVisit(AbstractFacts facts) {
	// TODO restrict filtering to the concepts listed in the `concept` field
	// (e.g. via a HashSet for O(1) lookup); until then all concepts are filtered
	removeAllDuplicates(facts);
}
}
histream-import/src/test/java/de/sekmi/histream/etl/validation/TestValidator.java
View file @
1b7d7b48
...
...
@@ -5,6 +5,7 @@ import org.junit.Test;
import
de.sekmi.histream.ObservationSupplier
;
import
de.sekmi.histream.etl.ETLObservationSupplier
;
import
de.sekmi.histream.etl.config.DataSource
;
import
de.sekmi.histream.io.Streams
;
public
class
TestValidator
{
...
...
@@ -69,4 +70,14 @@ public class TestValidator {
}
Assert
.
fail
(
"Exception expected"
);
}
@Test
public void validateData4WithDuplicateFilter() throws Exception {
	// the data source contains duplicate concepts which the
	// duplicate-fact post-processing filter is expected to remove
	try( ObservationSupplier os = ETLObservationSupplier
			.load(getClass().getResource("/data/test-4-datasource2.xml")) ){
		Validator v = new Validator(true, true);
		// escalate any validation error into a test failure
		v.setErrorHandler(e -> { throw new RuntimeException(e); });
		Streams.transfer(os, v);
	}
	// no duplicate concept exception should occur
}
}
histream-import/src/test/resources/data/test-4-datasource2.xml
0 → 100644
View file @
1b7d7b48
<?xml version="1.0" encoding="UTF-8"?>
<datasource
version=
"1.0"
xmlns:xsi=
"http://www.w3.org/2001/XMLSchema-instance"
>
<!-- Duplicate concept for patient -->
<meta>
<id>
LTx_PH_ILD_COPD
</id>
<etl-strategy>
replace-source
</etl-strategy>
</meta>
<patient-table>
<source
xsi:type=
"csv-file"
>
<url>
test-4-table.csv
</url>
<separator>
;
</separator>
</source>
<idat>
<!-- Vorname, Nachname, Geschlecht unbekannt -->
<patient-id
column=
"Pheno-ID"
/>
<given-name
column=
"Pheno-ID"
/>
<surname
column=
"Pheno-ID"
/>
</idat>
<ignore
xsi:type=
"string"
column=
"*"
/>
</patient-table>
<visit-table>
<source
xsi:type=
"csv-file"
>
<url>
test-4-table.csv
</url>
<separator>
;
</separator>
</source>
<idat>
<patient-id
column=
"Pheno-ID"
/>
<visit-id
column=
"Zeitpunkt"
/>
<start
column=
"Zeitpunkt"
format=
"d.M.u[ H[:m[:s]]]"
/>
</idat>
<ignore
xsi:type=
"string"
column=
"*"
/>
</visit-table>
<eav-table>
<source
xsi:type=
"csv-file"
>
<url>
test-4-table.csv
</url>
<separator>
;
</separator>
</source>
<idat>
<patient-id
column=
"Pheno-ID"
/>
<visit-id
column=
"Zeitpunkt"
/>
</idat>
<mdat>
<concept
column=
"Export-Param"
/>
<start
column=
"Zeitpunkt"
format=
"d.M.u[ H[:m[:s]]]"
/>
<end
column=
"Zeitpunkt"
format=
"d.M.u[ H[:m[:s]]]"
/>
<type
constant-value=
"string"
/>
<value
column=
"Wert"
na=
""
/>
<unit
column=
"Einheiten"
na=
""
/>
</mdat>
<virtual>
<value
column=
"Diagnose"
xsi:type=
"string"
na=
""
>
<map>
<otherwise
log-warning=
"Unexpected value"
action=
"drop-fact"
/>
</map>
</value>
<value
column=
"Zusatzdiagnose"
xsi:type=
"string"
na=
""
>
<map>
<case
value=
"IPF"
set-concept=
"B:DP-ID-IPF"
set-value=
""
/>
<case
value=
"UIP"
set-concept=
"B:DP-ID-IPF"
set-value=
""
/>
<otherwise
log-warning=
"Unexpected value"
action=
"drop-fact"
/>
</map>
</value>
<value
column=
"Probenart"
xsi:type=
"string"
na=
""
>
<map>
<otherwise
action=
"drop-fact"
/>
</map>
</value>
<value
column=
"Diesease Area"
xsi:type=
"string"
na=
""
>
<map>
<otherwise
action=
"drop-fact"
/>
</map>
</value>
</virtual>
<ignore
xsi:type=
"string"
column=
"*"
/>
</eav-table>
<post-processing>
<filter
xsi:type=
"duplicate-fact"
/>
</post-processing>
</datasource>
histream-import/src/test/resources/data/test-4-table.csv
View file @
1b7d7b48
Pheno-ID;Bereich;Zeitpunkt;Export-Param;Wert;Einheiten;
Mmqp212;Biobank-IDs;21.04.2016;Zusatzdiagnose;UIP;;
Mmqp212;Biobank-IDs;21.04.2016;Zusatzdiagnose;IPF;;
Mmqp212;Biobank-IDs;21.04.2016;Zusatzdiagnose;IPF;;
Mmqp212;Biobank-IDs;21.04.2016;B;IPF;;
Mmqp212;Biobank-IDs;21.04.2016;C;IPF;;
Mmqp212;Biobank-IDs;21.04.2016;B;IPF;;
histream-js/src/main/java/de/sekmi/histream/scripting/AbstractFacts.java
View file @
1b7d7b48
package
de.sekmi.histream.scripting
;
import
java.util.ArrayList
;
import
java.util.Arrays
;
import
java.util.Comparator
;
import
java.util.List
;
import
de.sekmi.histream.Observation
;
...
...
@@ -51,13 +53,18 @@ public abstract class AbstractFacts {
if
(
i
==
-
1
){
return
null
;
}
else
{
Fact
f
=
facts
.
remove
(
i
);
Observation
o
=
sourceList
.
remove
(
i
);
// verify that fact and observation are associated
assert
f
.
getObservation
()
==
o
;
return
f
;
return
removeIndex
(
i
);
}
}
/**
 * Removes the fact at the given position, together with its
 * associated observation, keeping both parallel lists in lock-step.
 *
 * @param index zero-based position of the fact to remove
 * @return the removed fact
 */
public Fact removeIndex(int index){
	Fact removedFact = facts.remove(index);
	Observation removedObs = sourceList.remove(index);
	// sanity check: the removed fact must belong to the removed observation
	assert removedFact.getObservation() == removedObs;
	return removedFact;
}
/**
 * Gets the fact at the given position.
 * @param index zero-based position
 * @return fact at {@code index}
 */
public Fact get(int index){
	return facts.get(index);
}
...
...
@@ -81,4 +88,37 @@ public abstract class AbstractFacts {
return
f
;
}
/**
 * Sorts the facts with the given comparator. The associated observation
 * list is reordered in exactly the same way, so corresponding entries of
 * {@code facts} and {@code sourceList} remain at equal positions.
 *
 * @param comparator comparator establishing the desired order of facts
 */
public void sort(Comparator<Fact> comparator){
	final int n = facts.size();
	// determine sort order: order[k] = current index of the fact
	// that belongs at position k after sorting
	Integer[] order = new Integer[n];
	for( int i = 0; i < n; i++ ){
		order[i] = i;
	}
	Arrays.sort(order, (x, y) -> comparator.compare(facts.get(x), facts.get(y)));
	// Build reordered copies first, then write them back. The previous
	// in-place cycle-swap applied the INVERSE permutation (it moved element i
	// to position order[i] although order[k] names the element destined FOR
	// position k) and produced a wrong order for any cycle longer than 2
	// (e.g. [C,A,B] came out as [B,C,A] instead of [A,B,C]).
	ArrayList<Fact> sortedFacts = new ArrayList<>(n);
	ArrayList<Observation> sortedSource = new ArrayList<>(n);
	for( int k = 0; k < n; k++ ){
		sortedFacts.add(facts.get(order[k]));
		sortedSource.add(sourceList.get(order[k]));
	}
	// reorder both lists in lock-step
	for( int k = 0; k < n; k++ ){
		facts.set(k, sortedFacts.get(k));
		sourceList.set(k, sortedSource.get(k));
	}
}
}