deduper is a set of utilities that help the user implement a deduplication process. It is based on some implementation details from: https://addi.ehu.es/handle/10810/28984?locale-attribute=en
// Example: run the block -> derive pipeline over a collection of sources and
// write the derived features out as CSV.
// NOTE(review): mySourcesCollection must expose collect(...); presumably it is a
// java.util.stream.Stream of sources — confirm against the library API.
mySourcesCollection
.collect(Sources.collector())
// blockingPredicate is a method supplied by the enclosing class.
.block(this::blockingPredicate)
.deriving()
// getFeatureDerivers() is supplied by the enclosing class and provides the
// feature derivers applied by derive().
.withFeatureDerivers(getFeatureDerivers())
.derive()
// The String argument is presumably the output file path — confirm.
.writeToCsv("myDataSet.csv");
// Example: derive the features into an in-memory CSV and load that CSV as Weka
// instances, without touching the file system.
ByteArrayOutputStream stream = new ByteArrayOutputStream();
// try-with-resources flushes and closes the writer even when a step of the
// pipeline throws, so the byte buffer is never read half-written.
try (BufferedWriter bufferedWriter = new BufferedWriter(new OutputStreamWriter(stream))) {
    sources.stream()
            .collect(Sources.collector())
            // NOTE(review): presumably restricts the sources to those present in
            // `test` — confirm against the library API.
            .onlyIn(test)
            .block(this::blockingPredicate)
            .deriving()
            .withFeatureDerivers(getFeatureDerivers())
            .withBuckets(test)
            .derive()
            .writeToCsv(bufferedWriter);
}
// The writer is closed at this point, so toByteArray() sees the complete CSV.
Instances instances = WekaUtils.getCsvInstances(
        new BufferedInputStream(new ByteArrayInputStream(stream.toByteArray())));
// Example: resolve pairs with a classifier, turn the result into clusters and
// compute a cost against `buckets`.
// abstractClassifier and threshold must be provided by the caller; `instances`
// is the data set loaded from the derived CSV.
PairResolution resolution = Solver.pairResolve(abstractClassifier, instances, threshold);
Buckets<String> clusters = resolution.toNormalizedClusters();
GMD gmd = new GMD();
// NOTE(review): `buckets` is not defined in this snippet; presumably it is the
// reference (ground-truth) bucketing the clusters are compared with — confirm.
GmdCost cost = gmd.cost(clusters, buckets);