From 6fda2bd70b86499ac8df23241ee8cc904b4b4061 Mon Sep 17 00:00:00 2001 From: deysn Date: Tue, 19 Jun 2018 12:40:41 -0400 Subject: [PATCH 01/29] TWRPv2 added , removed some classes, DBFriendly Projection fixed. TWRPv2 is the working version where only the centroids and counts are stored, instead of the whole set of vectors. Fixed DBFriendly Projection. Removed some classes that were not used. --- .classpath | 18 +- src/main/java/edu/uc/rphash/RPHash.java | 53 +- .../edu/uc/rphash/RPHashAdaptive2Pass.java | 113 ++- .../rphash/RPHashAdaptive2PassParallel.java | 370 --------- .../java/edu/uc/rphash/RPHashMultiProj.java | 307 ------- .../java/edu/uc/rphash/RPHashStreamingAK.java | 198 ----- .../edu/uc/rphash/Readers/RPHashObject.java | 14 +- src/main/java/edu/uc/rphash/TWRP1.java | 763 ++++++++++++++++++ src/main/java/edu/uc/rphash/TWRPv2.java | 534 ++++++++++++ .../java/edu/uc/rphash/decoders/Golay.java | 350 -------- .../java/edu/uc/rphash/decoders/PsdLSH.java | 252 ------ .../KHHCentroidCounterPush.java | 56 -- .../edu/uc/rphash/knee/BiggestMergeKnee.java | 38 - .../edu/uc/rphash/knee/KneeAlgorithm.java | 7 - .../java/edu/uc/rphash/knee/LpointKnee.java | 66 -- .../java/edu/uc/rphash/knee/SimpleKnee.java | 25 - .../projections/DBFriendlyProjection.java | 20 + .../edu/uc/rphash/standardhash/CrapWow.java | 121 --- .../edu/uc/rphash/tests/ScalabilityTest.java | 130 --- .../java/edu/uc/rphash/tests/TestRPhash.java | 4 +- .../tests/clusterers/AdaptiveMeanShift.java | 432 ---------- .../tests/clusterers/Agglomerative.java | 151 ---- .../tests/clusterers/Agglomerative2.java | 372 --------- .../uc/rphash/tests/clusterers/Kmeans.java | 330 -------- .../clusterers/LloydIterativeKmeans.java | 250 ------ .../edu/uc/rphash/tests/clusterers/MLE2.java | 333 -------- .../clusterers/MaxLikelihoodKMeans2.java | 451 ----------- .../edu/uc/rphash/tests/clusterers/SVD.java | 456 ----------- .../uc/rphash/tests/testStreamingRPHash.java | 184 ----- 29 files changed, 1427 insertions(+), 4971 
deletions(-) delete mode 100644 src/main/java/edu/uc/rphash/RPHashAdaptive2PassParallel.java delete mode 100644 src/main/java/edu/uc/rphash/RPHashMultiProj.java delete mode 100644 src/main/java/edu/uc/rphash/RPHashStreamingAK.java create mode 100644 src/main/java/edu/uc/rphash/TWRP1.java create mode 100644 src/main/java/edu/uc/rphash/TWRPv2.java delete mode 100644 src/main/java/edu/uc/rphash/decoders/Golay.java delete mode 100644 src/main/java/edu/uc/rphash/decoders/PsdLSH.java delete mode 100644 src/main/java/edu/uc/rphash/frequentItemSet/KHHCentroidCounterPush.java delete mode 100644 src/main/java/edu/uc/rphash/knee/BiggestMergeKnee.java delete mode 100644 src/main/java/edu/uc/rphash/knee/KneeAlgorithm.java delete mode 100644 src/main/java/edu/uc/rphash/knee/LpointKnee.java delete mode 100644 src/main/java/edu/uc/rphash/knee/SimpleKnee.java delete mode 100644 src/main/java/edu/uc/rphash/standardhash/CrapWow.java delete mode 100644 src/main/java/edu/uc/rphash/tests/ScalabilityTest.java delete mode 100644 src/main/java/edu/uc/rphash/tests/clusterers/AdaptiveMeanShift.java delete mode 100644 src/main/java/edu/uc/rphash/tests/clusterers/Agglomerative.java delete mode 100644 src/main/java/edu/uc/rphash/tests/clusterers/Agglomerative2.java delete mode 100644 src/main/java/edu/uc/rphash/tests/clusterers/Kmeans.java delete mode 100644 src/main/java/edu/uc/rphash/tests/clusterers/LloydIterativeKmeans.java delete mode 100644 src/main/java/edu/uc/rphash/tests/clusterers/MLE2.java delete mode 100644 src/main/java/edu/uc/rphash/tests/clusterers/MaxLikelihoodKMeans2.java delete mode 100644 src/main/java/edu/uc/rphash/tests/clusterers/SVD.java delete mode 100644 src/main/java/edu/uc/rphash/tests/testStreamingRPHash.java diff --git a/.classpath b/.classpath index 29836c6..8d805ce 100644 --- a/.classpath +++ b/.classpath @@ -1,9 +1,9 @@ - - - - - - - - - + + + + + + + + + diff --git a/src/main/java/edu/uc/rphash/RPHash.java b/src/main/java/edu/uc/rphash/RPHash.java index 
e62b867..2977bc4 100644 --- a/src/main/java/edu/uc/rphash/RPHash.java +++ b/src/main/java/edu/uc/rphash/RPHash.java @@ -22,11 +22,11 @@ import edu.uc.rphash.decoders.DepthProbingLSH; import edu.uc.rphash.decoders.Dn; import edu.uc.rphash.decoders.E8; -import edu.uc.rphash.decoders.Golay; + import edu.uc.rphash.decoders.Leech; import edu.uc.rphash.decoders.MultiDecoder; import edu.uc.rphash.decoders.OriginDecoder; -import edu.uc.rphash.decoders.PsdLSH; + import edu.uc.rphash.decoders.Spherical; import edu.uc.rphash.projections.DBFriendlyProjection; import edu.uc.rphash.projections.FJLTProjection; @@ -34,13 +34,13 @@ import edu.uc.rphash.projections.NoProjection; import edu.uc.rphash.projections.SVDProjection; import edu.uc.rphash.tests.StatTests; -import edu.uc.rphash.tests.clusterers.AdaptiveMeanShift; + import edu.uc.rphash.tests.clusterers.Agglomerative3; import edu.uc.rphash.tests.clusterers.DummyClusterer; import edu.uc.rphash.tests.clusterers.DBScan; import edu.uc.rphash.tests.clusterers.KMeans2; import edu.uc.rphash.tests.clusterers.KMeansPlusPlus; -import edu.uc.rphash.tests.clusterers.LloydIterativeKmeans; + import edu.uc.rphash.tests.clusterers.MultiKMPP; import edu.uc.rphash.tests.clusterers.StreamingKmeans; import edu.uc.rphash.tests.clusterers.StreamingKmeans2; @@ -552,11 +552,7 @@ public static List runConfigs(List untaggedArgs, so.setDecoderType(new E8(2f)); break; } - case "golay": { - o.setDecoderType(new Golay()); - so.setDecoderType(new Golay()); - break; - } + case "multie8": { o.setDecoderType(new MultiDecoder( o.getInnerDecoderMultiplier() * 8, new E8(2f))); @@ -576,23 +572,9 @@ public static List runConfigs(List untaggedArgs, .getInnerDecoderMultiplier() * 24, new Leech())); break; } - case "levypstable": { - o.setDecoderType(new PsdLSH(PsdLSH.LEVY, o.getDimparameter())); - so.setDecoderType(new PsdLSH(PsdLSH.LEVY, o.getDimparameter())); - break; - } - case "cauchypstable": { - o.setDecoderType(new PsdLSH(PsdLSH.CAUCHY, o.getDimparameter())); 
- so.setDecoderType(new PsdLSH(PsdLSH.CAUCHY, o.getDimparameter())); - break; - } - case "gaussianpstable": { - o.setDecoderType(new PsdLSH(PsdLSH.GAUSSIAN, o - .getDimparameter())); - so.setDecoderType(new PsdLSH(PsdLSH.GAUSSIAN, o - .getDimparameter())); - break; - } + + + case "sphere": {// pad to ~32 bits // int ctsofsphere = // (int)(Math.log(o.getDimparameter()*2)/Math.log(2.0)) /2; @@ -659,13 +641,7 @@ public static List runConfigs(List untaggedArgs, o.setOfflineClusterer(new KMeansPlusPlus()); so.setOfflineClusterer(new KMeansPlusPlus()); break; - case "adaptivemeanshift": { - - o.setOfflineClusterer(new AdaptiveMeanShift()); - so.setOfflineClusterer(new AdaptiveMeanShift()); - - break; - } + case "kmpp": { o.setOfflineClusterer(new KMeansPlusPlus()); @@ -727,11 +703,7 @@ public static List runConfigs(List untaggedArgs, runitems.add(new KMeans2(k, o.getRawData())); break; } - case "pkmeans": - runitems.add(new LloydIterativeKmeans(k, o.getRawData(), o - .getNumProjections())); - break; - + case "kmeansplusplus": runitems.add(new KMeansPlusPlus(o.getRawData(), k)); break; @@ -743,10 +715,7 @@ public static List runConfigs(List untaggedArgs, runitems.add(new StreamingKmeans2(o)); break; } - case "adaptivemeanshift": { - runitems.add(new AdaptiveMeanShift()); - break; - } + case "adaptive": { runitems.add(new RPHashAdaptive2Pass(o)); break; diff --git a/src/main/java/edu/uc/rphash/RPHashAdaptive2Pass.java b/src/main/java/edu/uc/rphash/RPHashAdaptive2Pass.java index 013ea27..1ff2008 100644 --- a/src/main/java/edu/uc/rphash/RPHashAdaptive2Pass.java +++ b/src/main/java/edu/uc/rphash/RPHashAdaptive2Pass.java @@ -1,8 +1,10 @@ package edu.uc.rphash; +import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map.Entry; @@ -16,11 +18,12 @@ import edu.uc.rphash.tests.StatTests; import 
edu.uc.rphash.tests.clusterers.Agglomerative3; import edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.VectorUtil; public class RPHashAdaptive2Pass implements Clusterer, Runnable { - boolean znorm = true; + boolean znorm = false; private int counter; @@ -63,17 +66,21 @@ float[] medoid(List X) { //float[] rngvec; the range vector is moot if incoming data has been normalized //post normalization it should all be zero centered, with variance 1 - /* * super simple hash algorithm, reminiscient of pstable lsh */ + // xt is the projected vector and x is the original vector , rngvec is the randomly generated vector of projected dim. + public long hashvec(float[] xt, float[] x, HashMap> IDAndCent, HashMap> IDAndLabel,int ct) { - long s = 1;//fixes leading 0's bug + long s = 1; //fixes leading 0's bug for (int i = 0; i < xt.length; i++) { - s <<= 1; +// s <<= 1; + s = s << 1 ; // left shift the bits of s by 1. if (xt[i] > rngvec[i]) - s += 1; +// s += 1; + s= s+1; + if (IDAndCent.containsKey(s)) { IDAndLabel.get(s).add(ct); IDAndCent.get(s).add(x); @@ -143,16 +150,17 @@ public List> findDensityModes() { projector.init(); int ct = 0; - if(znorm == true){ - float[] variance = StatTests.varianceCol(so.getRawData()); - float[] mean = StatTests.meanCols(so.getRawData()); - // #process data by adding to the counter - for (float[] x : so.getRawData()) - { - addtocounter(x, projector, IDAndCent,IDAndID,ct++,mean,variance); - } - } - else +// if(znorm == true){ +// float[] variance = StatTests.varianceCol(so.getRawData()); +// float[] mean = StatTests.meanCols(so.getRawData()); +// // #process data by adding to the counter +// for (float[] x : so.getRawData()) +// { +// addtocounter(x, projector, IDAndCent,IDAndID,ct++,mean,variance); +// } +// } +// +// else { for (float[] x : so.getRawData()) @@ -161,6 +169,34 @@ public List> findDensityModes() { } } + + for (Long name: IDAndCent.keySet()){ + + String key =name.toString(); + + + String value = 
IDAndCent.get(name).toString() ; + + // String value1 = Arrays.toString(value.toString()); + + System.out.println(key + " " + value); + + + + +} + + for (Long name: IDAndID.keySet()){ + + String key =name.toString(); + String value = IDAndID.get(name).toString(); + System.out.println(key + " " + value); + + +} + + + // next we want to prune the tree by parent count comparison // follows breadthfirst search HashMap denseSetOfIDandCount = new HashMap(); @@ -235,6 +271,7 @@ public void run() { centroids.add(medoid(clustermembers.get(i))); } Agglomerative3 aggloOffline = new Agglomerative3(centroids, so.getk()); + System.out.println(centroids.size()); aggloOffline.setWeights(weights); this.centroids = aggloOffline.getCentroids(); } @@ -242,26 +279,26 @@ public void run() { public static void main(String[] args) throws FileNotFoundException, IOException { - int k = 10; - int d = 1000; - int n = 10000; - float var = 0.1f; - int count = 10; - System.out.printf("ClusterVar\t"); - for (int i = 0; i < count; i++) - System.out.printf("Trial%d\t", i); - System.out.printf("RealWCSS\n"); - - for (float f = var; f < 5.01; f += .05f) { + int k = 3; + int d = 100; + int n = 2000; + float var = 1.0f;//0.5f; + int count = 1; + // System.out.printf("ClusterVar\t"); + // for (int i = 0; i < count; i++) + // System.out.printf("Trial%d\t", i); + // System.out.printf("RealWCSS\n"); + + for (float f = var; f < 1.01; f += 1.5f) { float avgrealwcss = 0; float avgtime = 0; - System.out.printf("%f\t", f); + // System.out.printf("%f\t", f); for (int i = 0; i < count; i++) { - GenerateData gen = new GenerateData(k, n / k, d, f, true, .5f); + GenerateData gen = new GenerateData(k, n/k, d, f, true, .5f); // gen.writeCSVToFile(new // File("/home/lee/Desktop/reclsh/in.csv")); RPHashObject o = new SimpleArrayReader(gen.data, k); - o.setDimparameter(32); + o.setDimparameter(4); RPHashAdaptive2Pass rphit = new RPHashAdaptive2Pass(o); long startTime = System.nanoTime(); List centsr = 
rphit.getCentroids(); @@ -271,12 +308,22 @@ public static void main(String[] args) throws FileNotFoundException, avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(), gen.getData()); - System.out.printf("%.0f\t", - StatTests.WCSSECentroidsFloat(centsr, gen.data)); - System.gc(); + + String Output = "/C:/Users/user/Desktop/temp/OutputTwrpCents" ; + VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); + + // System.out.printf("%.0f\t", + // StatTests.WCSSECentroidsFloat(centsr, gen.data)); + // System.gc(); } - System.out.printf("%.0f\n", avgrealwcss / count); + // System.out.printf("%.0f\n", avgrealwcss / count); + + + } + + + } @Override diff --git a/src/main/java/edu/uc/rphash/RPHashAdaptive2PassParallel.java b/src/main/java/edu/uc/rphash/RPHashAdaptive2PassParallel.java deleted file mode 100644 index 4ad318a..0000000 --- a/src/main/java/edu/uc/rphash/RPHashAdaptive2PassParallel.java +++ /dev/null @@ -1,370 +0,0 @@ -package edu.uc.rphash; - -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Random; -import java.util.TreeSet; -import java.util.concurrent.Callable; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.ForkJoinPool; -import java.util.concurrent.Future; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -import edu.uc.rphash.Readers.RPHashObject; -import edu.uc.rphash.Readers.SimpleArrayReader; -import edu.uc.rphash.projections.Projector; -import edu.uc.rphash.tests.StatTests; -import edu.uc.rphash.tests.clusterers.Agglomerative3; -import edu.uc.rphash.tests.generators.GenerateData; - -public class RPHashAdaptive2PassParallel implements Clusterer, Runnable { - 
- boolean znorm = true; - - private float[] rngvec; - private List centroids = null; - private RPHashObject so; - int threads = 4; - - public RPHashAdaptive2PassParallel(RPHashObject so) { - this.threads = 4; - this.so = so; - } - - public RPHashAdaptive2PassParallel(List data, int k, int processors) { - this.threads = processors; - so = new SimpleArrayReader(data, k); - } - - public List getCentroids(RPHashObject so) { - this.so = so; - return getCentroids(); - } - - @Override - public List getCentroids() { - if (centroids == null) - run(); - return centroids; - } - - /* - * X - set of vectors compute the medoid of a vector set - */ - float[] medoid(List X) { - float[] ret = X.get(0); - for (int i = 1; i < X.size(); i++) { - for (int j = 0; j < ret.length; j++) { - ret[j] += X.get(i)[j]; - } - } - for (int j = 0; j < ret.length; j++) { - ret[j] = ret[j] / ((float) X.size()); - } - return ret; - } - - // float[] rngvec; the range vector is moot if incoming data has been - // normalized - // post normalization it should all be zero centered, with variance 1 - - /* - * super simple hash algorithm, reminiscient of pstable lsh - */ - public long hashvec(float[] xt, float[] x, - Map> IDAndCent, - Map> IDAndLabel, int ct) { - long s = 1;// fixes leading 0's bug - for (int i = 0; i < xt.length; i++) { - s <<= 1; - if (xt[i] > rngvec[i]) - s += 1; - if (IDAndCent.containsKey(s)) { - if (IDAndLabel.get(s) != null) - IDAndLabel.get(s).add(ct); - if (IDAndCent.get(s) != null) - IDAndCent.get(s).add(x); - } else { - ArrayList xlist = new ArrayList<>(); - xlist.add(x); - IDAndCent.put(s, xlist); - ArrayList idlist = new ArrayList<>(); - idlist.add(ct); - IDAndLabel.put(s, idlist); - } - } - return s; - } - - /* - * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid - * vector map - * - * hash the projected vector x and update the hash to centroid and counts - * maps - */ - void addtocounter(float[] x, Projector p, - Map> IDAndCent, - Map> IDandID, int ct) { - 
float[] xt = p.project(x); - - hashvec(xt, x, IDAndCent, IDandID, ct); - } - - /* - * X - data set k - canonical k in k-means l - clustering sub-space Compute - * density mode via iterative deepening hash counting - */ - public Collection> findDensityModes() - throws InterruptedException, ExecutionException { - - // #create projector matrixs - Projector projector = so.getProjectionType(); - projector.setOrigDim(so.getdim()); - projector.setProjectedDim(so.getDimparameter()); - projector.setRandomSeed(so.getRandomSeed()); - projector.init(); - - // int ct = 1; - - List dat = so.getRawData(); - - //this counter gets shared - AtomicInteger ct = new AtomicInteger(0); - - ForkJoinPool executor = new ForkJoinPool(this.threads); - - int chunksize = dat.size() / this.threads; - - //This is the array of essentially thread objets that process in parallel - ArrayList>>> gather = new ArrayList<>(this.threads); - - for (int i = 0; i < this.threads; i++) { - - int chunk = chunksize* i; - gather.add(executor.submit(new Callable>>() { - - // this is the mapper function. the dataset is split among the processing threads - // each thread performs the projections and counter adds. - // this method is sequentially bottlenecked in regard to the add part - // there are some ways to fix this, but ultimately each thread needs to maintain - // its own count-sketch. 
then those sketch must be merged, via the binary - // operation - public Map> call() { - Map> IDAndCent = new HashMap<>(); - Map> IDAndID = new HashMap<>(); - for (int j = chunk; j < chunksize + chunk && j < dat.size(); j++) { - addtocounter(dat.get(j), projector, IDAndCent, IDAndID, - ct.incrementAndGet()); - } - return IDAndCent ;//new Object[] { IDAndCent, IDAndID }; - } - })); - } - - List>> gatheredCent = new ArrayList<>(this.threads); -// List> gatheredID = new ArrayList<>(this.threads); - -// executor.awaitTermination(10,TimeUnit.SECONDS); - for (Future>> f : gather) { - Map> o = f.get(); - gatheredCent.add(o); -// gatheredID.add((Map) o[1]); - } - - executor.shutdown(); - - - // this function merges the centroid sets in parallel. - // it would be the basis of the reduce part - // even though the functions are called map, the return is a collection/gather operation - Map IDAndCent = gatheredCent - .stream() - .parallel() - .map(Map::entrySet) - .flatMap(Collection::stream) - .collect( - Collectors.toConcurrentMap(Map.Entry::getKey, - Map.Entry::getValue, - (old, latest)->{ - old.addAll(latest); - return old; - } - )); - - - //this is sequential... 
- // next we want to prune the tree by parent count comparison - // follows breadthfirst search - HashMap denseSetOfIDandCount = new HashMap(); - for (Long cur_id : new TreeSet(IDAndCent.keySet())) { - if (cur_id > so.getk()) { - int cur_count = IDAndCent.get(cur_id).size(); - long parent_id = cur_id >>> 1; - int parent_count = IDAndCent.get(parent_id).size(); - - if (cur_count != 0 && parent_count != 0) { - if (cur_count == parent_count) { - denseSetOfIDandCount.put(parent_id, 0L); - IDAndCent.put(parent_id, new ArrayList<>()); - denseSetOfIDandCount.put(cur_id, (long) cur_count); - } else { - if (2 * cur_count > parent_count) { - denseSetOfIDandCount.remove(parent_id); - IDAndCent.put(parent_id, new ArrayList<>()); - denseSetOfIDandCount.put(cur_id, (long) cur_count); - } - } - } - } - } - - // remove keys with support less than 1 - Stream> stream = denseSetOfIDandCount.entrySet() - .parallelStream().filter(p -> p.getValue() > 1); - // 64 so 6 bits? - // stream = stream.filter(p -> p.getKey() > 64); - - List sortedIDList = new ArrayList<>(); - // sort and limit the list - stream.sorted(Entry. comparingByValue().reversed()) - .limit(so.getk() * 4).parallel() - .forEachOrdered(x -> sortedIDList.add(x.getKey())); - - // compute centroids - - HashMap> estcents = new HashMap<>(); - for (int i = 0; i < sortedIDList.size(); i++) - { - estcents.put(sortedIDList.get(i), - new ArrayList(IDAndCent.get(sortedIDList.get(i)))); - } - - return estcents.values(); - } - - public void run() { - rngvec = new float[so.getDimparameter()]; - Random r = new Random(so.getRandomSeed()); - for (int i = 0; i < so.getDimparameter(); i++) - rngvec[i] = (float) r.nextGaussian(); - - Collection> clustermembers; - try { - clustermembers = findDensityModes(); - - List centroids = new ArrayList<>(); - - List weights = new ArrayList<>(); - int k = clustermembers.size() > 200 + so.getk() ? 
200 + so.getk() - : clustermembers.size(); - - for (List cl : clustermembers) { - weights.add(new Float(cl.size())); - centroids.add(medoid(cl)); - } - - Agglomerative3 aggloOffline = new Agglomerative3(centroids, - so.getk()); - aggloOffline.setWeights(weights); - this.centroids = aggloOffline.getCentroids(); - } catch (InterruptedException | ExecutionException e) { - - e.printStackTrace(); - } - } - - public static void main(String[] args) throws FileNotFoundException, - IOException { - - int k = 10; - int d = 1000; - int n = 10000; - float var = 1.1f; - int count = 10; - System.out.printf("ClusterVar\t"); - for (int i = 0; i < count; i++) - System.out.printf("Trial%d\t", i); - System.out.printf("RealWCSS\n"); - - for (float f = var; f < 5.01; f += .05f) { - float avgrealwcss = 0; - float avgtime = 0; - System.out.printf("%f\t", f); - for (int i = 0; i < count; i++) { - GenerateData gen = new GenerateData(k, n / k, d, f, true, 1f); - // gen.writeCSVToFile(new - // File("/home/lee/Desktop/reclsh/in.csv")); - RPHashObject o = new SimpleArrayReader(gen.data, k); - o.setDimparameter(32); - - RPHashAdaptive2PassParallel rphit = new RPHashAdaptive2PassParallel( - o); - long startTime = System.nanoTime(); - List centsr = rphit.getCentroids(); - - avgtime += (System.nanoTime() - startTime) / 100000000; - - avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(), - gen.getData()); - - System.out.printf("%.0f\t", - StatTests.WCSSECentroidsFloat(centsr, gen.data)); - System.gc(); - } - System.out.printf("%.0f\n", avgrealwcss / count); - } - } - - @Override - public RPHashObject getParam() { - return so; - } - - @Override - public void setWeights(List counts) { - // TODO Auto-generated method stub - - } - - @Override - public void setData(List centroids) { - this.centroids = centroids; - - } - - @Override - public void setRawData(List centroids) { - if (this.centroids == null) - this.centroids = new ArrayList<>(centroids.size()); - for (float[] f : centroids) { - 
this.centroids.add(new Centroid(f, 0)); - } - } - - @Override - public void setK(int getk) { - this.so.setK(getk); - } - - @Override - public void reset(int randomseed) { - centroids = null; - so.setRandomSeed(randomseed); - } - - @Override - public boolean setMultiRun(int runs) { - return false; - } -} diff --git a/src/main/java/edu/uc/rphash/RPHashMultiProj.java b/src/main/java/edu/uc/rphash/RPHashMultiProj.java deleted file mode 100644 index 18f6f47..0000000 --- a/src/main/java/edu/uc/rphash/RPHashMultiProj.java +++ /dev/null @@ -1,307 +0,0 @@ -package edu.uc.rphash; - -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.Random; - -import edu.uc.rphash.Readers.RPHashObject; -import edu.uc.rphash.Readers.SimpleArrayReader; -import edu.uc.rphash.decoders.Decoder; -import edu.uc.rphash.decoders.Leech; -import edu.uc.rphash.decoders.Spherical; -import edu.uc.rphash.frequentItemSet.ItemSet; -import edu.uc.rphash.frequentItemSet.SimpleFrequentItemSet; -import edu.uc.rphash.lsh.LSH; -import edu.uc.rphash.projections.Projector; -import edu.uc.rphash.standardhash.HashAlgorithm; -import edu.uc.rphash.standardhash.NoHash; -import edu.uc.rphash.tests.StatTests; -import edu.uc.rphash.tests.generators.GenerateData; - -/** - * This is the correlated multi projections approach. In this RPHash variation - * we try to incorporate the advantage of multiple random projections in order - * to combat increasing cluster error rates as the deviation between projected - * and full data increases. The main idea is similar to the referential RPHash, - * however the set union is projection id dependent. This will be done in a - * simplified bitmask addition to the hash code in lieu of an array of sets data - * structures. 
- * - * @author lee - * - */ -public class RPHashMultiProj implements Clusterer { - float variance; - - public RPHashObject map() { - Iterator vecs = so.getVectorIterator(); - if (!vecs.hasNext()) - return so; - - long[] hash; - int projections = so.getNumProjections(); - - int k = (int) (so.getk() * 2); - - // initialize our counter - ItemSet is = new SimpleFrequentItemSet(k); - // create our LSH Device - // create same LSH Device as before - - Random r = new Random(so.getRandomSeed()); - LSH[] lshfuncs = new LSH[projections]; - Decoder dec = so.getDecoderType(); - dec.setCounter(is); - HashAlgorithm hal = new NoHash(so.getHashmod()); - - // create same projection matrices as before - for (int i = 0; i < projections; i++) { - Projector p = so.getProjectionType(); - p.setOrigDim(so.getdim()); - p.setProjectedDim(dec.getDimensionality()); - p.setRandomSeed(r.nextLong()); - p.init(); - - List noise = LSH.genNoiseTable(dec.getDimensionality(), - so.getNumBlur(), r, - dec.getErrorRadius() / dec.getDimensionality()); - - lshfuncs[i] = new LSH(dec, p, hal, noise, so.getNormalize()); - } - - // add to frequent itemset the hashed Decoded randomly projected vector - while (vecs.hasNext()) { - float[] vec = vecs.next(); - // iterate over the multiple projections - for (LSH lshfunc : lshfuncs) { - // could do a big parallel projection here - hash = lshfunc.lshHashRadius(vec, so.getNumBlur()); - for (long hh : hash) { - is.add(hh); - } - } - } - so.setPreviousTopID(is.getTop()); - List countsAsFloats = new ArrayList(); - for (long ct : is.getCounts()) - countsAsFloats.add((float) ct); - so.setCounts(countsAsFloats); - return so; - } - - /* - * This is the second phase after the top ids have been in the reduce phase - * aggregated - */ - public RPHashObject reduce() { - Iterator vecs = so.getVectorIterator(); - if (!vecs.hasNext()) - return so; - - // make a set of k default centroid objects - ArrayList centroids = new ArrayList(); - for (long id : so.getPreviousTopID()) - 
centroids.add(new Centroid(so.getdim(), id, -1)); - - long[] hash; - int projections = so.getNumProjections(); - - // create our LSH Device - // create same LSH Device as before - Random r = new Random(so.getRandomSeed()); - LSH[] lshfuncs = new LSH[projections]; - Decoder dec = so.getDecoderType(); - HashAlgorithm hal = new NoHash(so.getHashmod()); - - // create same projection matrices as before - for (int i = 0; i < projections; i++) { - Projector p = so.getProjectionType(); - p.setOrigDim(so.getdim()); - p.setProjectedDim(dec.getDimensionality()); - p.setRandomSeed(r.nextLong()); - p.init(); - List noise = LSH.genNoiseTable(dec.getDimensionality(), - so.getNumBlur(), r, - dec.getErrorRadius() / dec.getDimensionality()); - lshfuncs[i] = new LSH(dec, p, hal, noise, so.getNormalize()); - } - - while (vecs.hasNext()) { - float[] vec = vecs.next(); - // iterate over the multiple projections - for (LSH lshfunc : lshfuncs) { - // could do a big parallel projection here - hash = lshfunc.lshHashRadius(vec, so.getNumBlur()); - for (Centroid cent : centroids) { - for (long hh : hash) { - if (cent.ids.contains(hh)) { - cent.updateVec(vec); - cent.addID(hh); - } - } - } - } - } - so.setCentroids(centroids); - return so; - } - - private List centroids = null; - private RPHashObject so; - private int runs; - - public RPHashMultiProj(int k, List data) { - so = new SimpleArrayReader(data, k); - runs = 1; - } - - public RPHashMultiProj(RPHashObject so) { - this.so = so; - } - - public RPHashMultiProj() { - so = new SimpleArrayReader(); - } - - public List getCentroids(RPHashObject so) { - this.so = so; - - if (centroids == null) - run(); - return centroids; - } - - @Override - public List getCentroids() { - if (centroids == null) { - run(); - } - return centroids; - } - - private void run() { - runs = 1; - double minwcss = Double.MAX_VALUE; - List mincentroids = new ArrayList<>(); - for (int currun = 0; currun < runs;) { - - map(); - reduce(); - - Clusterer offlineclusterer = 
so.getOfflineClusterer(); - List tmpcents; - if (offlineclusterer != null) { - offlineclusterer.setMultiRun(1);// is deterministic - offlineclusterer.setData(so.getCentroids()); - offlineclusterer.setWeights(so.getCounts()); - offlineclusterer.setK(so.getk()); - tmpcents = offlineclusterer.getCentroids(); - } else { - tmpcents = so.getCentroids().subList(0, so.getk()); - } - - if (tmpcents.size() == so.getk()) {// skip bad clusterings - double tmpwcss = StatTests.WCSSECentroidsFloat(tmpcents, - so.getRawData()); - // System.out.println(tmpwcss + ":" + so.getCounts()); - if (tmpwcss < minwcss) { - minwcss = tmpwcss; - mincentroids = tmpcents; - } - currun++; - } - - this.reset(new Random().nextInt()); - - } - - this.centroids = mincentroids; - } - - public static void main(String[] args) { - - int k = 10; - int d = 1000; - int n = 10000; - float var = .6f; - int count = 5; - System.out.printf("Decoder: %s\n","Spherical"); - System.out.printf("ClusterVar\t"); - for (int i = 0; i < count; i++) - System.out.printf("Trial%d\t", i); - System.out.printf("RealWCSS\n"); - - - - for (float f = var; f < 3.01; f += .1f) { - float avgrealwcss = 0; - float avgtime = 0; - System.out.printf("%f\t", f); - for (int i = 0; i < count; i++) { - GenerateData gen = new GenerateData(k, n / k, d, f, true, 1f); - RPHashObject o = new SimpleArrayReader(gen.data, k); - o.setDecoderType(new Spherical(32,4,1)); - o.setDimparameter(32); - RPHashMultiProj rphit = new RPHashMultiProj(o); - long startTime = System.nanoTime(); - List centsr = rphit.getCentroids(); - - avgtime += (System.nanoTime() - startTime) / 100000000; - - avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(), - gen.getData()); - - System.out.printf("%.0f\t", - StatTests.WCSSECentroidsFloat(centsr, gen.data)); - System.gc(); - - } - System.out.printf("%.0f\n", avgrealwcss / count); - } - } - - @Override - public RPHashObject getParam() { - return so; - } - - @Override - public void setWeights(List counts) { - } - - 
@Override - public void setData(List data) { - centroids = new ArrayList<>(); - for (Centroid c : data) { - so.addRawData(c.centroid); - } - so.setDimparameter(data.get(0).dimensions); - } - - @Override - public void setK(int getk) { - this.so.setK(getk); - } - - @Override - public void setRawData(List data) { - so.setRawData(data); - this.so.setDimparameter(data.get(0).length); - } - - @Override - public void reset(int randomseed) { - centroids = null; - so.setRandomSeed(randomseed); - } - - @Override - public boolean setMultiRun(int runs) { - this.runs = runs; - return true; - } - -} diff --git a/src/main/java/edu/uc/rphash/RPHashStreamingAK.java b/src/main/java/edu/uc/rphash/RPHashStreamingAK.java deleted file mode 100644 index 873c20d..0000000 --- a/src/main/java/edu/uc/rphash/RPHashStreamingAK.java +++ /dev/null @@ -1,198 +0,0 @@ -package edu.uc.rphash; - -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.Random; - -import edu.uc.rphash.Readers.RPHashObject; -import edu.uc.rphash.Readers.SimpleArrayReader; -import edu.uc.rphash.decoders.Decoder; -import edu.uc.rphash.frequentItemSet.KHHCentroidCounterPush; -import edu.uc.rphash.knee.LpointKnee; -import edu.uc.rphash.lsh.LSH; -import edu.uc.rphash.projections.DBFriendlyProjection; -import edu.uc.rphash.projections.Projector; -import edu.uc.rphash.standardhash.HashAlgorithm; -import edu.uc.rphash.standardhash.MurmurHash; -import edu.uc.rphash.tests.StatTests; -import edu.uc.rphash.tests.clusterers.LloydIterativeKmeans; -import edu.uc.rphash.tests.generators.ClusterGenerator; - -/**This is an adaptation of RPHash Streaming with support for - * automatic knee finding and time based cluster decay. 
- * @author lee - * - */ -public class RPHashStreamingAK implements StreamClusterer { - - public KHHCentroidCounterPush is; - private LSH[] lshfuncs; - private StatTests vartracker; - private List centroids = null; - private RPHashObject so; - - - @Override - public synchronized long addVectorOnlineStep(float[] vec) { - - if(!lshfuncs[0].lshDecoder.selfScaling()){ - this.vartracker.updateVarianceSampleVec(vec); - vec = this.vartracker.scaleVector(vec); - } - - - Centroid c = new Centroid(vec,-1); - int ret = -1; - - for (LSH lshfunc : lshfuncs) { - if (so.getNumBlur() != 1) { - long[] hash = lshfunc - .lshHashRadius(vec, so.getNumBlur()); - for (long h : hash) { - c.addID(h); - is.addLong(h, 1); - } - } else { - long hash = lshfunc.lshHash(vec); - c.addID(hash); - is.addLong(hash, 1); - } - } - ret = is.addAndUpdate(c); - - return ret; - } - - public void init() { - Random r = new Random(so.getRandomSeed()); - this.vartracker = new StatTests(.01f); - int projections = so.getNumProjections(); - - // initialize our counter - float decayrate = so.getDecayRate();// 1f;// bottom number is window - // size - is = new KHHCentroidCounterPush(decayrate,new LpointKnee()); - // create LSH Device - lshfuncs = new LSH[projections]; - Decoder dec = so.getDecoderType(); - HashAlgorithm hal = new MurmurHash(so.getHashmod()); - // create projection matrices add to LSH Device - for (int i = 0; i < projections; i++) { - Projector p = new DBFriendlyProjection(so.getdim(), - dec.getDimensionality(), r.nextLong()); - List noise = LSH.genNoiseTable(dec.getDimensionality(), - so.getNumBlur(), r, dec.getErrorRadius() - / dec.getDimensionality()); - lshfuncs[i] = new LSH(dec, p, hal, noise,so.getNormalize()); - } - } - - public RPHashStreamingAK(ClusterGenerator c) { - so = new SimpleArrayReader(c,0); - init(); - } - - public RPHashStreamingAK(RPHashObject so) { - this.so = so; - init(); - } - - - - @Override - public List getCentroids() { - if (centroids == null) { - init(); - run(); - 
getCentroidsOfflineStep(); - } - return centroids; - } - - public List getCentroidsOfflineStep() { - - centroids = is.getTop(); - - -// centroids = new ArrayList(); -// List counts = is.getCounts(); -// -// for (int i = 0; i < cents.size(); i++) { -// centroids.add(cents.get(i).centroid()); -// } - - Clusterer offlineclusterer = so.getOfflineClusterer(); - offlineclusterer.setWeights(so.getCounts()); - offlineclusterer.setData(so.getCentroids()); - offlineclusterer.setK(so.getk()); - centroids = offlineclusterer.getCentroids(); - - return centroids; - } - - public void run() { - // add to frequent itemset the hashed Decoded randomly projected - // vector - Iterator vecs = so.getVectorIterator(); - while (vecs.hasNext()) { - addVectorOnlineStep(vecs.next()); - } - } - - public List getTopIdSizes() { - return is.getCounts(); - } - - @Override - public RPHashObject getParam() { - return this.so; - } - - @Override - public void setWeights(List counts) { - // TODO Auto-generated method stub - - } - - @Override - public void setRawData(List data) { -// this.data = data; - } - - @Override - public void setData(List centroids) { - ArrayList data = new ArrayList(centroids.size()); - for(Centroid c : centroids)data.add(c.centroid()); - setRawData(data); - } - - @Override - public void setK(int getk) { - // TODO Auto-generated method stub - - } - - @Override - public void shutdown() { - // TODO Auto-generated method stub - - } - - @Override - public void reset(int randomseed) { - centroids = null; - so.setRandomSeed(randomseed); - } - - @Override - public boolean setMultiRun(int runs) { - return false; - } - - @Override - public int getProcessors() { - return 1; - } - -} diff --git a/src/main/java/edu/uc/rphash/Readers/RPHashObject.java b/src/main/java/edu/uc/rphash/Readers/RPHashObject.java index 499ae9f..9f38946 100644 --- a/src/main/java/edu/uc/rphash/Readers/RPHashObject.java +++ b/src/main/java/edu/uc/rphash/Readers/RPHashObject.java @@ -8,20 +8,21 @@ import 
edu.uc.rphash.decoders.Decoder; import edu.uc.rphash.decoders.DepthProbingLSH; import edu.uc.rphash.decoders.E8; -import edu.uc.rphash.decoders.Golay; + import edu.uc.rphash.decoders.Leech; import edu.uc.rphash.decoders.MultiDecoder; -import edu.uc.rphash.decoders.PsdLSH; + import edu.uc.rphash.decoders.Spherical; import edu.uc.rphash.projections.DBFriendlyProjection; +import edu.uc.rphash.projections.GaussianProjection; import edu.uc.rphash.projections.Projector; -import edu.uc.rphash.tests.clusterers.Agglomerative; + import edu.uc.rphash.tests.clusterers.Agglomerative3; import edu.uc.rphash.tests.clusterers.Agglomerative3.ClusteringType; import edu.uc.rphash.tests.clusterers.KMeans2; import edu.uc.rphash.tests.clusterers.KMeans2NoWCSS; import edu.uc.rphash.tests.clusterers.KMeansPlusPlus; -import edu.uc.rphash.tests.clusterers.Kmeans; + import edu.uc.rphash.tests.clusterers.MultiKMPP; import edu.uc.rphash.tests.clusterers.DBScan; @@ -41,8 +42,9 @@ public interface RPHashObject { //final static Clusterer DEFAULT_OFFLINE_CLUSTERER = new MultiKMPP(); - final static Projector DEFAULT_PROJECTOR = new DBFriendlyProjection(); - + final static Projector DEFAULT_PROJECTOR = new DBFriendlyProjection(); + //final static Projector DEFAULT_PROJECTOR = new GaussianProjection(); + int getdim(); Iterator getVectorIterator(); diff --git a/src/main/java/edu/uc/rphash/TWRP1.java b/src/main/java/edu/uc/rphash/TWRP1.java new file mode 100644 index 0000000..aaf3540 --- /dev/null +++ b/src/main/java/edu/uc/rphash/TWRP1.java @@ -0,0 +1,763 @@ +package edu.uc.rphash; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map.Entry; +import java.util.Random; +import java.util.TreeSet; +import java.util.stream.Stream; +import java.util.Map; + + +import edu.uc.rphash.Readers.RPHashObject; +import 
edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.clusterers.Agglomerative3; +import edu.uc.rphash.tests.generators.GenerateData; + + + +public class TWRP1 implements Clusterer, Runnable { + + boolean znorm = false; + + + private int counter; + private float[] rngvec; + private List centroids = null; + + + private RPHashObject so; + + public TWRP1(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + return getCentroids(); + } + + @Override + public List getCentroids() { + if (centroids == null) + run2(); + return centroids; + } + + + + + + /* + * X - set of vectors compute the medoid of a vector set + */ + float[] medoid(List X) { + float[] ret = X.get(0); + for (int i = 1; i < X.size(); i++) { + for (int j = 0; j < ret.length; j++) { + ret[j] += X.get(i)[j]; + } + } + for (int j = 0; j < ret.length; j++) { + ret[j] = ret[j] / ((float) X.size()); + } + return ret; + } + + //float[] rngvec; the range vector is moot if incoming data has been normalized + //post normalization it should all be zero centered, with variance 1 + + /* + * super simple hash algorithm, reminiscient of pstable lsh + */ + // xt is the projected vector and x is the original vector , rngvec is the randomly generated vector of projected dim. + + public long hashvec(float[] xt, float[] x, + HashMap> IDAndCent, HashMap> IDAndLabel,int ct) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { +// s <<= 1; + s = s << 1 ; // left shift the bits of s by 1. 
+ if (xt[i] > rngvec[i]) +// s += 1; + s= s+1; + + if (IDAndCent.containsKey(s)) { + IDAndLabel.get(s).add(ct); + IDAndCent.get(s).add(x); + } else { + List xlist = new ArrayList<>(); + xlist.add(x); + IDAndCent.put(s, xlist); + List idlist = new ArrayList<>(); + idlist.add(ct); + IDAndLabel.put(s, idlist); + } + } + return s; + } + + + /* + * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid + * vector map + * + * hash the projected vector x and update the hash to centroid and counts + * maps + */ + void addtocounter(float[] x, Projector p, + HashMap> IDAndCent,HashMap> IDandID,int ct) { + float[] xt = p.project(x); + +// counter++; +// for(int i = 0;i> IDAndCent,HashMap> IDandID,int ct,float[] mean,float[] variance) + { + float[] xt = p.project(StatTests.znormvec(x, mean, variance)); + +// counter++; +// for(int i = 0;i> findDensityModes() { + HashMap> IDAndCent = new HashMap<>(); + HashMap> IDAndID = new HashMap<>(); + // #create projector matrixs + Projector projector = so.getProjectionType(); + projector.setOrigDim(so.getdim()); + projector.setProjectedDim(so.getDimparameter()); + projector.setRandomSeed(so.getRandomSeed()); + projector.init(); + + int ct = 0; +// if(znorm == true){ +// float[] variance = StatTests.varianceCol(so.getRawData()); +// float[] mean = StatTests.meanCols(so.getRawData()); +// // #process data by adding to the counter +// for (float[] x : so.getRawData()) +// { +// addtocounter(x, projector, IDAndCent,IDAndID,ct++,mean,variance); +// } +// } +// +// else + { + + for (float[] x : so.getRawData()) + { + addtocounter(x, projector, IDAndCent, IDAndID,ct++); + } + } + + + for (Long name: IDAndCent.keySet()){ + + String key =name.toString(); + System.out.println(key ); + + // String value = IDAndCent.get(name).toString() ; + // String value1 = Arrays.toString(value.toString()); + + // System.out.println(key + " " + value); + + +} + + for (Long name: IDAndID.keySet()){ + + // String key =name.toString(); + // String value 
= IDAndID.get(name).toString(); + // System.out.println(key + " " + value); + + +} + + // we would compress the hashmaps. SetOfIDandCount has the ids and the counts corresponding to that id. + // we have two hashmaps: 1. IDAndCent and 2. IDAndID. we will use IDAndCent + + + HashMap MapOfIDAndCount = new HashMap(); + + HashMap MapOfIDAndCent = new HashMap(); + + for (Long cur_id : new TreeSet(IDAndCent.keySet())) + { + int cur_count = IDAndCent.get(cur_id).size(); + + MapOfIDAndCount.put(cur_id, (long) cur_count); // this has the hashids and counts. + + List bucketpoints = new ArrayList<>(); + + Iterator e = IDAndCent.get(cur_id).iterator(); + + // int i=1; + while (e.hasNext()) { + + // System.out.println(i++); + + bucketpoints.add(e.next()) ; + + } + + float [] bucketcent; + + bucketcent = medoid(bucketpoints); + + MapOfIDAndCent.put(cur_id, bucketcent); // this has the hashids and centroids. + + // System.out.println(cur_id + " " + cur_count); + + // int c = MapOfIDAndCent.get(cur_id).length; + + // System.out.println(cur_id + " " + c); + + + } + + + + // next we want to prune the tree by parent count comparison + // follows breadthfirst search + HashMap denseSetOfIDandCount = new HashMap(); + for (Long cur_id : new TreeSet(IDAndCent.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = IDAndCent.get(cur_id).size(); + long parent_id = cur_id>>>1; + int parent_count = IDAndCent.get(parent_id).size(); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount.put(parent_id, 0L); + IDAndCent.put(parent_id, new ArrayList<>()); + denseSetOfIDandCount.put(cur_id, (long) cur_count); + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount.remove(parent_id); + IDAndCent.put(parent_id, new ArrayList<>()); + denseSetOfIDandCount.put(cur_id, (long) cur_count); + } + } + } + } + } + + + + HashMap denseSetOfIDandCount2 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount.keySet())) + { + if (cur_id 
>so.getk()){ + int cur_count = (int) (MapOfIDAndCount.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + //HashMap> IDAndCent = new HashMap<>(); and HashMap MapOfIDAndCent = new HashMap(); + + MapOfIDAndCent.put(parent_id, new float[]{}); + + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent.put(parent_id, new float[]{}); + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2.put(cur_id, (long) cur_count); + } + } + } + } + } + + + + + //remove keys with support less than 1 + Stream> stream = denseSetOfIDandCount.entrySet().stream().filter(p -> p.getValue() > 1); + + Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); + //64 so 6 bits? + //stream = stream.filter(p -> p.getKey() > 64); + + List sortedIDList= new ArrayList<>(); + // sort and limit the list + stream.sorted(Entry. comparingByValue().reversed()).limit(so.getk()*4) + .forEachOrdered(x -> sortedIDList.add(x.getKey())); + + List sortedIDList2= new ArrayList<>(); + // sort and limit the list + stream2.sorted(Entry. 
comparingByValue().reversed()).limit(so.getk()*4) + .forEachOrdered(x -> sortedIDList2.add(x.getKey())); + + + + // compute centroids + + HashMap> estcents = new HashMap<>(); + for (int i =0; i KeyAndCent = new HashMap<>(); + HashMap KeyAndCount = new HashMap<>(); + HashMap WeightAndCent = new HashMap<>(); + + for (int i =0; i(estcents.values()); + } + + + + + + + + + + + + public HashMap findDensityModes2() { + HashMap> IDAndCent = new HashMap<>(); + HashMap> IDAndID = new HashMap<>(); + // #create projector matrixs + Projector projector = so.getProjectionType(); + projector.setOrigDim(so.getdim()); + projector.setProjectedDim(so.getDimparameter()); + projector.setRandomSeed(so.getRandomSeed()); + projector.init(); + + int ct = 0; +// if(znorm == true){ +// float[] variance = StatTests.varianceCol(so.getRawData()); +// float[] mean = StatTests.meanCols(so.getRawData()); +// // #process data by adding to the counter +// for (float[] x : so.getRawData()) +// { +// addtocounter(x, projector, IDAndCent,IDAndID,ct++,mean,variance); +// } +// } +// +// else + { + + for (float[] x : so.getRawData()) + { + addtocounter(x, projector, IDAndCent, IDAndID,ct++); + } + } + + + for (Long name: IDAndCent.keySet()){ + + String key =name.toString(); + System.out.println(key ); + + // String value = IDAndCent.get(name).toString() ; +// String value1 = Arrays.toString(value.toString()); + +// System.out.println(key + " " + value); + + +} + + for (Long name: IDAndID.keySet()){ + +// String key =name.toString(); +// String value = IDAndID.get(name).toString(); +// System.out.println(key + " " + value); + + +} + + // we would compress the hashmaps. SetOfIDandCount has the ids and the counts corresponding to that id. + // we have two hashmaps: 1. IDAndCent and 2. IDAndID. 
we will use IDAndCent + + + HashMap MapOfIDAndCount = new HashMap(); + + HashMap MapOfIDAndCent = new HashMap(); + + for (Long cur_id : new TreeSet(IDAndCent.keySet())) + { + int cur_count = IDAndCent.get(cur_id).size(); + + MapOfIDAndCount.put(cur_id, (long) cur_count); // this has the hashids and counts. + + List bucketpoints = new ArrayList<>(); + + Iterator e = IDAndCent.get(cur_id).iterator(); + +// int i=1; + while (e.hasNext()) { + +// System.out.println(i++); + + bucketpoints.add(e.next()) ; + + } + + float [] bucketcent; + + bucketcent = medoid(bucketpoints); + + MapOfIDAndCent.put(cur_id, bucketcent); // this has the hashids and centroids. + +// System.out.println(cur_id + " " + cur_count); + + // int c = MapOfIDAndCent.get(cur_id).length; + + // System.out.println(cur_id + " " + c); + + + } + + + + // next we want to prune the tree by parent count comparison + // follows breadthfirst search + HashMap denseSetOfIDandCount = new HashMap(); + for (Long cur_id : new TreeSet(IDAndCent.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = IDAndCent.get(cur_id).size(); + long parent_id = cur_id>>>1; + int parent_count = IDAndCent.get(parent_id).size(); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount.put(parent_id, 0L); + IDAndCent.put(parent_id, new ArrayList<>()); + denseSetOfIDandCount.put(cur_id, (long) cur_count); + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount.remove(parent_id); + IDAndCent.put(parent_id, new ArrayList<>()); + denseSetOfIDandCount.put(cur_id, (long) cur_count); + } + } + } + } + } + + + + HashMap denseSetOfIDandCount2 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == 
parent_count) { + denseSetOfIDandCount2.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + +//HashMap> IDAndCent = new HashMap<>(); and HashMap MapOfIDAndCent = new HashMap(); + + MapOfIDAndCent.put(parent_id, new float[]{}); + + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent.put(parent_id, new float[]{}); + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2.put(cur_id, (long) cur_count); + } + } + } + } + } + + + + + //remove keys with support less than 1 + Stream> stream = denseSetOfIDandCount.entrySet().stream().filter(p -> p.getValue() > 1); + + Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); + //64 so 6 bits? + //stream = stream.filter(p -> p.getKey() > 64); + + List sortedIDList= new ArrayList<>(); + // sort and limit the list + stream.sorted(Entry. comparingByValue().reversed()).limit(so.getk()*4) + .forEachOrdered(x -> sortedIDList.add(x.getKey())); + + List sortedIDList2= new ArrayList<>(); + // sort and limit the list + stream2.sorted(Entry. 
comparingByValue().reversed()).limit(so.getk()*4) + .forEachOrdered(x -> sortedIDList2.add(x.getKey())); + + + + // compute centroids + + HashMap> estcents = new HashMap<>(); + for (int i =0; i KeyAndCent = new HashMap<>(); + HashMap KeyAndCount = new HashMap<>(); + HashMap WeightAndCent = new HashMap<>(); + + for (int i =0; i(estcents.values()); + + return WeightAndCent; + + +} + + + + + + + public void run() { + rngvec = new float[so.getDimparameter()]; + counter = 0; + Random r = new Random(so.getRandomSeed()); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) r.nextGaussian(); + + List> clustermembers = findDensityModes(); + Listcentroids = new ArrayList<>(); + + List weights =new ArrayList<>(); + int k = clustermembers.size()>200+so.getk()?200+so.getk():clustermembers.size(); + for(int i=0;i WeightAndClusters = findDensityModes2(); + + + Listcentroids2 = new ArrayList<>(); + List weights2 =new ArrayList<>(); + + + int NumberOfMicroClusters = WeightAndClusters.size() ; + + + int k = NumberOfMicroClusters>200+so.getk()?200+so.getk():NumberOfMicroClusters; + + + // have to prune depending NumberOfMicroClusters returned. 
+ + for (Long weights : new TreeSet(WeightAndClusters.keySet())) + + { + weights2.add((float)weights); + centroids2.add(WeightAndClusters.get(weights)); + } + + + + Agglomerative3 aggloOffline = new Agglomerative3(centroids2, so.getk()); + aggloOffline.setWeights(weights2); + + this.centroids = aggloOffline.getCentroids(); + + + } + + + + + + public static void main(String[] args) throws FileNotFoundException, + IOException { + + int k = 6;//6; + int d = 64;//16; + int n = 700; + float var = .5f; + int count = 1; + // System.out.printf("ClusterVar\t"); + // for (int i = 0; i < count; i++) + // System.out.printf("Trial%d\t", i); + // System.out.printf("RealWCSS\n"); + + for (float f = var; f < 1.01; f += 1.5f) { + float avgrealwcss = 0; + float avgtime = 0; + // System.out.printf("%f\t", f); + for (int i = 0; i < count; i++) { + GenerateData gen = new GenerateData(k, n/k, d, f, true, .5f); + // gen.writeCSVToFile(new + // File("/home/lee/Desktop/reclsh/in.csv")); + RPHashObject o = new SimpleArrayReader(gen.data, k); + o.setDimparameter(4); + TWRP1 rphit = new TWRP1(o); + long startTime = System.nanoTime(); + List centsr = rphit.getCentroids(); + + avgtime += (System.nanoTime() - startTime) / 100000000; + + avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(), + gen.getData()); + + // System.out.printf("%.0f\t", + // StatTests.WCSSECentroidsFloat(centsr, gen.data)); + // System.gc(); + } + System.out.printf("%.0f\n", avgrealwcss / count); + + } + } + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) 
{ + this.so.setK(getk); + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + return false; + } +} diff --git a/src/main/java/edu/uc/rphash/TWRPv2.java b/src/main/java/edu/uc/rphash/TWRPv2.java new file mode 100644 index 0000000..08b05a2 --- /dev/null +++ b/src/main/java/edu/uc/rphash/TWRPv2.java @@ -0,0 +1,534 @@ +package edu.uc.rphash; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map.Entry; +import java.util.Random; +import java.util.TreeSet; +import java.util.stream.Stream; +import java.util.Map; + + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.clusterers.Agglomerative3; +import edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.VectorUtil; + + + +public class TWRPv2 implements Clusterer, Runnable { + + boolean znorm = false; + + + private int counter; + private float[] rngvec; + private List centroids = null; + + + private RPHashObject so; + + public TWRPv2(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + return getCentroids(); + } + + @Override + public List getCentroids() { + if (centroids == null) + run(); + return centroids; + } + + + /* + * X - set of vectors compute the medoid of a vector set + */ + float[] medoid(List X) { + float[] ret = X.get(0); + for (int i = 1; i < X.size(); i++) { + for (int j = 0; j < ret.length; j++) { + ret[j] += X.get(i)[j]; + } + } + for (int j = 0; j < ret.length; j++) { + ret[j] = ret[j] / ((float) X.size()); + } + return ret; + } + + + public static float[][] UpdateHashMap(float 
cnt_1, float[] x_1, + float cnt_2, float[] x_2) { + + float cnt_r = cnt_1 + cnt_2; + + float[] x_r = new float[x_1.length]; + + for (int i = 0; i < x_1.length; i++) { + x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; + + } + + float[][] ret = new float[2][]; + ret[0] = new float[1]; + ret[0][0] = cnt_r; + ret[1] = x_r; + return ret; + } + + + + + + //float[] rngvec; the range vector is moot if incoming data has been normalized + //post normalization it should all be zero centered, with variance 1 + + /* + * super simple hash algorithm, reminiscient of pstable lsh + */ + // xt is the projected vector and x is the original vector , rngvec is the randomly generated vector of projected dim. + + public long hashvec(float[] xt, float[] x, + HashMap> IDAndCent, HashMap> IDAndLabel,int ct) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { +// s <<= 1; + s = s << 1 ; // left shift the bits of s by 1. + if (xt[i] > rngvec[i]) +// s += 1; + s= s+1; + + if (IDAndCent.containsKey(s)) { + IDAndLabel.get(s).add(ct); + IDAndCent.get(s).add(x); + } else { + List xlist = new ArrayList<>(); + xlist.add(x); + IDAndCent.put(s, xlist); + List idlist = new ArrayList<>(); + idlist.add(ct); + IDAndLabel.put(s, idlist); + } + } + return s; + } + + public long hashvec2(float[] xt, float[] x, + HashMap MapOfIDAndCent, HashMap MapOfIDAndCount,int ct) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { +// s <<= 1; + s = s << 1 ; // left shift the bits of s by 1. 
+ if (xt[i] > rngvec[i]) +// s += 1; + s= s+1; + + + + if (MapOfIDAndCent.containsKey(s)) { + + float CurrentCount = MapOfIDAndCount.get(s); + float CurrentCent [] = MapOfIDAndCent.get(s); + float CountForIncomingVector = 1; + float IncomingVector [] = x; + + float[][] MergedValues = UpdateHashMap(CurrentCount , CurrentCent, CountForIncomingVector, IncomingVector ); + + Long UpdatedCount = (long) MergedValues[0][0] ; + + float[] MergedVector = MergedValues[1] ; + + MapOfIDAndCount.put(s , UpdatedCount); + + MapOfIDAndCent.put(s, MergedVector); + + } + + + else { + + float[] xlist = x; + MapOfIDAndCent.put(s, xlist); + MapOfIDAndCount.put(s, (long)1); + } + } + return s; + } + + + /* + * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid + * vector map + * + * hash the projected vector x and update the hash to centroid and counts + * maps + */ + void addtocounter(float[] x, Projector p, + HashMap> IDAndCent,HashMap> IDandID,int ct) { + float[] xt = p.project(x); + +// counter++; +// for(int i = 0;i> IDAndCent,HashMap> IDandID,int ct,float[] mean,float[] variance) + { + float[] xt = p.project(StatTests.znormvec(x, mean, variance)); + +// counter++; +// for(int i = 0;i findDensityModes2() { + HashMap> IDAndCent = new HashMap<>(); + HashMap> IDAndID = new HashMap<>(); + // #create projector matrixs + Projector projector = so.getProjectionType(); + projector.setOrigDim(so.getdim()); + projector.setProjectedDim(so.getDimparameter()); + projector.setRandomSeed(so.getRandomSeed()); + projector.init(); + + int ct = 0; +// if(znorm == true){ +// float[] variance = StatTests.varianceCol(so.getRawData()); +// float[] mean = StatTests.meanCols(so.getRawData()); +// // #process data by adding to the counter +// for (float[] x : so.getRawData()) +// { +// addtocounter(x, projector, IDAndCent,IDAndID,ct++,mean,variance); +// } +// } +// +// else + { + + for (float[] x : so.getRawData()) + { + addtocounter(x, projector, IDAndCent, IDAndID,ct++); + } + } + + + for 
(Long name: IDAndCent.keySet()){ + + String key =name.toString(); + System.out.println(key ); + + // String value = IDAndCent.get(name).toString() ; +// String value1 = Arrays.toString(value.toString()); + +// System.out.println(key + " " + value); + + +} + + for (Long name: IDAndID.keySet()){ + +// String key =name.toString(); +// String value = IDAndID.get(name).toString(); +// System.out.println(key + " " + value); + + +} + + // we would compress the hashmaps. SetOfIDandCount has the ids and the counts corresponding to that id. + // we have two hashmaps: 1. IDAndCent and 2. IDAndID. we will use IDAndCent + + + HashMap MapOfIDAndCount = new HashMap(); + + HashMap MapOfIDAndCent = new HashMap(); + + for (Long cur_id : new TreeSet(IDAndCent.keySet())) + { + int cur_count = IDAndCent.get(cur_id).size(); + + MapOfIDAndCount.put(cur_id, (long) cur_count); // this has the hashids and counts. + + List bucketpoints = new ArrayList<>(); + + Iterator e = IDAndCent.get(cur_id).iterator(); + +// int i=1; + while (e.hasNext()) { + +// System.out.println(i++); + + bucketpoints.add(e.next()) ; + + } + + float [] bucketcent; + + bucketcent = medoid(bucketpoints); + + MapOfIDAndCent.put(cur_id, bucketcent); // this has the hashids and centroids. 
+ +// System.out.println(cur_id + " " + cur_count); + + // int c = MapOfIDAndCent.get(cur_id).length; + + // System.out.println(cur_id + " " + c); + + + } + +// int NumberOfMicroClustersBeforePruning = MapOfIDAndCent.size() ; +// System.out.println("NumberOfMicroClustersBeforePruning = "+ NumberOfMicroClustersBeforePruning); + + // next we want to prune the tree by parent count comparison + // follows breadthfirst search + + + + HashMap denseSetOfIDandCount2 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + +//HashMap> IDAndCent = new HashMap<>(); and HashMap MapOfIDAndCent = new HashMap(); + + MapOfIDAndCent.put(parent_id, new float[]{}); + + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent.put(parent_id, new float[]{}); + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2.put(cur_id, (long) cur_count); + } + } + } + } + } + + + + + //remove keys with support less than 1 + + + Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); + //64 so 6 bits? + //stream = stream.filter(p -> p.getKey() > 64); + + + + List sortedIDList2= new ArrayList<>(); + // sort and limit the list + stream2.sorted(Entry. 
comparingByValue().reversed()).limit(so.getk()*4) + .forEachOrdered(x -> sortedIDList2.add(x.getKey())); + + + + + HashMap KeyAndCent = new HashMap<>(); + HashMap KeyAndCount = new HashMap<>(); + HashMap WeightAndCent = new HashMap<>(); + + for (int i =0; i WeightAndClusters = findDensityModes2(); + + + Listcentroids2 = new ArrayList<>(); + List weights2 =new ArrayList<>(); + + + int NumberOfMicroClusters = WeightAndClusters.size() ; + System.out.println("NumberOfMicroClusters = "+ NumberOfMicroClusters); + + // int k = NumberOfMicroClusters>200+so.getk()?200+so.getk():NumberOfMicroClusters; + + // have to prune depending NumberOfMicroClusters returned. + + for (Long weights : new TreeSet(WeightAndClusters.keySet())) + + { + weights2.add((float)weights); + centroids2.add(WeightAndClusters.get(weights)); + } + + + //System.out.printf("\tvalueofK is "); + //System.out.println( so.getk()); + + Agglomerative3 aggloOffline = new Agglomerative3(centroids2, so.getk()); + + //System.out.println(centroids2.size()); + aggloOffline.setWeights(weights2); + + this.centroids = aggloOffline.getCentroids(); + + + } + + + + + + public static void main(String[] args) throws FileNotFoundException, + IOException { + + int k = 5;//6; + int d = 100;//16; + int n = 5000; + float var = 1.5f; + int count = 1; + // System.out.printf("ClusterVar\t"); + // for (int i = 0; i < count; i++) + // System.out.printf("Trial%d\t", i); + // System.out.printf("RealWCSS\n"); + + String Output = "/C:/Users/user/Desktop/temp/OutputTwrpCents" ; + + float f = var; + float avgrealwcss = 0; + float avgtime = 0; + // System.out.printf("%f\t", f); + GenerateData gen = new GenerateData(k, n/k, d, f, true, .5f); + // gen.writeCSVToFile(new + // File("/home/lee/Desktop/reclsh/in.csv")); + RPHashObject o = new SimpleArrayReader(gen.data, k); + o.setDimparameter(8); + + TWRPv2 rphit = new TWRPv2(o); + long startTime = System.nanoTime(); + List centsr = rphit.getCentroids(); + + avgtime += (System.nanoTime() - 
startTime) / 100000000; + + avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(), + gen.getData()); + + VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); + + // System.out.printf("%.0f\t", + // StatTests.WCSSECentroidsFloat(centsr, gen.data)); + // System.gc(); + + // System.out.printf("%.0f\n", avgrealwcss / count); + + + } + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + this.so.setK(getk); + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + return false; + } +} diff --git a/src/main/java/edu/uc/rphash/decoders/Golay.java b/src/main/java/edu/uc/rphash/decoders/Golay.java deleted file mode 100644 index 6392790..0000000 --- a/src/main/java/edu/uc/rphash/decoders/Golay.java +++ /dev/null @@ -1,350 +0,0 @@ -package edu.uc.rphash.decoders; - -import java.util.Arrays; -import java.util.Random; - -import edu.uc.rphash.frequentItemSet.Countable; -import edu.uc.rphash.standardhash.MurmurHash; -import edu.uc.rphash.util.VectorUtil; - -public class Golay implements Decoder{ - /** - * Utility methods that converts a binary string into and int. - * - * @param str a string containing a binary number - * - * @return the numeric value of the supplied string - */ - - private static int fromBinary(final String str) { - return Integer.parseInt(str, 2); - } - - /** - * Mask that preserves the last 12 bits (bits in dataword). 
- */ - - private static final int MASK = 0xfff; //== fromBinary("111111111111"); - - /** - * Generator matrix for the code, multiplied with a dataword to generate a codeword. - */ - - private static final int[] sGenerator = { - - fromBinary("100000000000"), - fromBinary("010000000000"), - fromBinary("001000000000"), - fromBinary("000100000000"), - fromBinary("000010000000"), - fromBinary("000001000000"), - fromBinary("000000100000"), - fromBinary("000000010000"), - fromBinary("000000001000"), - fromBinary("000000000100"), - fromBinary("000000000010"), - fromBinary("000000000001"), - - /* ALTERNATIVE MATRIX - UNUSED - fromBinary("110111000101"), - fromBinary("101110001011"), - fromBinary("011100010111"), - fromBinary("111000101101"), - fromBinary("110001011011"), - fromBinary("100010110111"), - fromBinary("000101101111"), - fromBinary("001011011101"), - fromBinary("010110111001"), - fromBinary("101101110001"), - fromBinary("011011100011"), - fromBinary("111111111110"), - */ - - fromBinary("011111111111"), - fromBinary("111011100010"), - fromBinary("110111000101"), - fromBinary("101110001011"), - fromBinary("111100010110"), - fromBinary("111000101101"), - fromBinary("110001011011"), - fromBinary("100010110111"), - fromBinary("100101101110"), - fromBinary("101011011100"), - fromBinary("110110111000"), - fromBinary("101101110001"), - }; - - /** - * Transpose of the generator matrix, multiplied with a codeword to generate a syndrome. 
- */ - - private static final int[] sCheck = { - - fromBinary("011111111111100000000000"), - fromBinary("111011100010010000000000"), - fromBinary("110111000101001000000000"), - fromBinary("101110001011000100000000"), - fromBinary("111100010110000010000000"), - fromBinary("111000101101000001000000"), - fromBinary("110001011011000000100000"), - fromBinary("100010110111000000010000"), - fromBinary("100101101110000000001000"), - fromBinary("101011011100000000000100"), - fromBinary("110110111000000000000010"), - fromBinary("101101110001000000000001"), - - }; - - /** - * A 4096 (2^12) element array that maps datawords to codewords. - */ - - private static final int[] sCodewords; - - /** - * A 4096 (2^12) element array that maps syndromes to error bits. - */ - - private static final int[] sErrors; - - //static initialization - static { - sCodewords = computeCodewords(); - sErrors = computeErrors(); - } - - /** - * Generates the codewords array. - * - * @return an array for assignment to {@link sCodewords} - */ - - private static int[] computeCodewords() { - int[] cws = new int[4096]; - //iterate over all valid datawords - for (int i = 0; i < 4096; i++) { - //multiply dataword by generator matrix - int cw = 0; - for (int j = 0; j < 24; j++) { - int d = i & sGenerator[j]; - int p = Integer.bitCount(d); - cw = (cw << 1) | (p & 1); - } - //store resulting codeword - cws[i] = cw; - } - return cws; - } - - /** - * Generates error array. 
- * - * @return an array for assignment to {@link sErrors} - */ - - private static int[] computeErrors() { - int[] errors = new int[4096]; - //fill array with -1 (indicates that error cannot be corrected - Arrays.fill(errors, -1); - - //record syndrome for zero error (valid) word - { - int error = 0; - int syn = syndrome(error); - errors[syn] = error; - } - - //record syndrome for each single error word - for (int i = 0; i < 24; i++) { - int error = 1 << i; - int syn = syndrome(error); - errors[syn] = error; - } - - //record syndrome for each double error word - for (int i = 1; i < 24; i++) { - for (int j = 0; j < i; j++) { - int error = (1 << i) | (1 << j); - int syn = syndrome(error); - errors[syn] = error; - } - } - - //record syndrome for each triple error word - for (int i = 2; i < 24; i++) { - for (int j = 1; j < i; j++) { - for (int k = 0; k < j; k++) { - int error = (1 << i) | (1 << j) | (1 << k); - int syn = syndrome(error); - errors[syn] = error; - } - } - } - - //code can't resolve quadruple errors - return errors; - } - - /** - * Encodes a 12 bit data word into a codeword. The 12 bits must be in the - * least significant positions and all other supplied bits must be zero. - * - * @param data a 12 bit data word - * @return the 24 bit code word - */ - - public static int encode(final int data) { - return sCodewords[data]; - } - - /** - * Computes the syndrome for the supplied codeword. The 24 bits must be in - * the least significant positions. - * - * @param word a candidate code word - * @return the syndrome for the supplied word - */ - - public static int syndrome(final int word) { - //multiply codeword by the check matrix - int syndrome = 0; - for (int j = 0; j < 12; j++) { - int d = word & sCheck[j]; - int p = Integer.bitCount(d); - syndrome = (syndrome << 1) | (p & 1); - } - return syndrome; - } - - /** - * Whether the supplied candidate code word is a valid code word. 
The 24 - * bits must be in the least significant positions and all other supplied - * bits must be zero. - * - * @param word the candidate code word - * @return true iff the supplied word is a valid codeword - */ - - public static boolean isCodeword(final int word) { - //optimization - is it worth it? - int w = Integer.bitCount(word); - if (w != 0 && w != 8 && w != 12 && w != 16 && w != 24) return false; - return syndrome(word) == 0; - } - - /** - * Decodes a valid code word into a dataword. - * - * @param codeword a valid code word - * @return the corresponding data word - */ - public static int decodeWord(final int codeword) { - return (codeword >> 12) & MASK; - } - - /** - * Attempts to correct and decode a codeword. The 24 bits must be in the - * least significant positions and all other supplied bits must be zero. - * NOTE: for codewords with four errors, this method does not attempt any correction - * - * @param word a word to be decoded - * @return a decoded and possibly corrected data word - */ - - public static int correctAndDecode(final int word) { - int err = sErrors[ syndrome(word) ]; - //for 4 errors we currently just give up!! - return err <= 0 ? decodeWord(word) : decodeWord(word ^ err); - } - - private float[] variance; - - // constructor - - /** - * Cannot be instantiated. 
- */ - - public Golay() { } - - - public static void main(String[] args) { - Random r = new Random(); - int d = 24; - - Golay sp = new Golay(); - MurmurHash hash = new MurmurHash(Integer.MAX_VALUE); - float testResolution = 10000f; - - for (int i = 0; i < 300; i++) { - int ct = 0; - float distavg = 0.0f; - for (int j = 0; j < testResolution; j++) { - float p1[] = new float[d]; - float p2[] = new float[d]; - - // generate a vector - for (int k = 0; k < d; k++) { - p1[k] = r.nextFloat() * 2 - 1; - p2[k] = (float) (p1[k] + r.nextGaussian() - * ((float) i / 1000f)); - } - float dist = VectorUtil.distance(p1, p2); - distavg += dist; - - long hp1 = hash.hash(sp.decode(p1)); - long hp2 = hash.hash(sp.decode(p2)); - - ct+=(hp2==hp1)?1:0; - - } - System.out.println(distavg / testResolution + "\t" + (float) ct - / testResolution); - } - } - -// float varTot = 1.0f; - @Override - public long[] decode(float[] p1) { - int codeword = 0; - if(p1[0]>0)codeword+=1; - for(int i=1;i<24;i++){ - codeword<<=1; - if(p1[i]>0)codeword+=1; - } - return new long[]{correctAndDecode(codeword)}; - } - -// @Override -// public void setVariance(float[] parameterObject) { -// variance = parameterObject; -// for(int i = 0 ; i - float[] rndBs; - // Vector> - float[][] stableArray; - - public PsdLSH(int M, int L, int D, int T, float W) { - this.M = M; - this.L = L; - this.T = T; - this.W = W; - this.D = D; - bits = (int) Math.ceil(Math.log(M) / Math.log(2)); - rndBs = new float[L]; - stableArray = new float[L][D]; - initialize(); - } - - public PsdLSH() { - M = 256; - L = 4; - T = GAUSSIAN; - W = 2f; - D = 32; - bits = (int) Math.ceil(Math.log(M) / Math.log(2)); - rndBs = new float[L]; - stableArray = new float[L][D]; - ; - initialize(); - } - - public PsdLSH(int psdtype, int innerDecoderMultiplier) { - M = 256; - L = 4; - T = psdtype; - if (psdtype == LEVY) - W = 2f; - if (psdtype == GAUSSIAN) - W = 1f; - if (psdtype == CAUCHY) - W = 2f; - D = innerDecoderMultiplier; - - bits = (int) 
Math.ceil(Math.log(M) / Math.log(2)); - rndBs = new float[L]; - stableArray = new float[L][D]; - ; - initialize(); - } - - private void initialize() { - - Random rng = new Random(); - - switch (T) { - case 0: { - LevyDistribution ld = new LevyDistribution(0, 1); - for (int l = 0; l < L; l++) { - int d = 0; - while (d < D) { - stableArray[l][d] = (float) ld.sample(); - if (stableArray[l][d] < 3f && stableArray[l][d] > -3f) { - d++; - } - } - rndBs[l] = rng.nextFloat() * W; - } - return; - } - - case 1: { - CauchyDistribution cd = new CauchyDistribution(); - - for (int l = 0; l < L; l++) { - int d = 0; - while (d < D) { - stableArray[l][d] = (float) cd.sample(); - if (stableArray[l][d] < 3f && stableArray[l][d] > -3f) { - d++; - } - } - - rndBs[l] = rng.nextFloat() * W; - } - return; - } - case 2: { - for (int l = 0; l < L; l++) { - for (int d = 0; d < D; d++) { - stableArray[l][d] = (float) rng.nextGaussian(); - } - rndBs[l] = rng.nextFloat() * W; - } - return; - } - default: { - return; - } - } - } - - long[] hash(float[] v) { - - long[] hashVal = new long[1]; - // long hashVal = 0; - int tmp; - for (int l = 0; l < L; l++) { - // dot product with stable distribution - float sum = rndBs[l]; - for (int d = 0; d < D; d++) { - sum += v[d] * stableArray[l][d]; - } - tmp = ((int) ((sum) / W)); - tmp %= M; - // shift negative number to the other side - hashVal[0] += tmp; - hashVal[0] <<= this.bits; - } - return hashVal; - } - - public static void main(String[] args) { - Random r = new Random(); - // int M = 256; - // int L = 8; - // int T = LEVY; - // float W = 1f; - int d = 24; - - PsdLSH sp = new PsdLSH(); - - // MultiDecoder sp = new MultiDecoder( d, e8); - MurmurHash hash = new MurmurHash(Integer.MAX_VALUE); - float testResolution = 10000f; - - HashMap ctmap = new HashMap(); - - for (int i = 0; i < 400; i++) { - int ct = 0; - float distavg = 0.0f; - for (int j = 0; j < testResolution; j++) { - float p1[] = new float[d]; - float p2[] = new float[d]; - - // generate a 
vector - for (int k = 0; k < d; k++) { - p1[k] = r.nextFloat() * 2 - 1f; - p2[k] = (float) (p1[k] + r.nextGaussian() - * ((float) i / 1000f)); - } - float dist = VectorUtil.distance(p1, p2); - distavg += dist; - long[] l1 = sp.decode(p1); - long[] l2 = sp.decode(p2); - - ctmap.put(l1[0], - ctmap.containsKey(l1[0]) ? 1 + ctmap.get(l1[0]) : 1); - - long hp1 = hash.hash(l1); - long hp2 = hash.hash(l2); - - // ctmap.put(hp1,ctmap.containsKey(hp1)?1+ctmap.get(hp1):1); - - ct += (hp2 == hp1) ? 1 : 0; - - } - - System.out.println(distavg / testResolution + "\t" + (float) ct - / testResolution); - } - } - -// @Override -// public void setVariance(float[] parameterObject) { -// this.variance = parameterObject; -// } - - @Override - public int getDimensionality() { - return D; - } - - @Override - public long[] decode(float[] f) { - return hash(f); - } - - @Override - public float getErrorRadius() { - return 1; - } - - @Override - public float getDistance() { - return 0; - } - - @Override - public boolean selfScaling() { - return true; - } - - @Override - public void setCounter(Countable counter) { - // TODO Auto-generated method stub - - } - -// @Override -// public float[] getVariance() { -// return this.variance; -// } - -} diff --git a/src/main/java/edu/uc/rphash/frequentItemSet/KHHCentroidCounterPush.java b/src/main/java/edu/uc/rphash/frequentItemSet/KHHCentroidCounterPush.java deleted file mode 100644 index 819135a..0000000 --- a/src/main/java/edu/uc/rphash/frequentItemSet/KHHCentroidCounterPush.java +++ /dev/null @@ -1,56 +0,0 @@ -package edu.uc.rphash.frequentItemSet; - -import java.util.Iterator; -import java.util.List; - -import edu.uc.rphash.Centroid; -import edu.uc.rphash.knee.KneeAlgorithm; - -public class KHHCentroidCounterPush extends KHHCentroidCounter { - - int estimatedKnee = 0; - KneeAlgorithm kne; - - public KHHCentroidCounterPush(float decay, KneeAlgorithm kne) { - super(1000, decay); - this.kne = kne; - } - - /* - * (non-Javadoc) - * - * @see 
edu.uc.rphash.frequentItemSet.KHHCentroidCounter#getTop() - */ - @Override - public List getTop() { - return super.getTop(); - } - - /** - * @see edu.uc.rphash.frequentItemSet.KHHCentroidCounter#add(edu.uc.rphash.Centroid) - * This method adds a new vector to the khhcounter and performs knee - * finding on the khhset - * @param the - * cluster to be added c - * @return the estimated number of clusters using the provided KneeAlgorithm - * if the estimation changes or -1 if it does not - */ - public int addAndUpdate(Centroid c) { - super.add(c); - // check for new clusters - int size = frequentItems.values().size(); - float[] counts = new float[size]; - Iterator it = frequentItems.values().iterator(); - for (int i = 0; it.hasNext(); i++) { - counts[i] = it.next().getCount(); - } - int tmpknee = kne.findKnee(counts); - if (tmpknee != estimatedKnee) { - estimatedKnee = tmpknee; - return estimatedKnee; - } else { - return -1; - } - } - -} diff --git a/src/main/java/edu/uc/rphash/knee/BiggestMergeKnee.java b/src/main/java/edu/uc/rphash/knee/BiggestMergeKnee.java deleted file mode 100644 index 2980d0d..0000000 --- a/src/main/java/edu/uc/rphash/knee/BiggestMergeKnee.java +++ /dev/null @@ -1,38 +0,0 @@ -package edu.uc.rphash.knee; - -import edu.uc.rphash.util.VectorUtil; - -public class BiggestMergeKnee implements KneeAlgorithm { - - @Override - public int findKnee(float[] data) { - - return data.length/2; - } - - - - - /** - * this function creates a linear model y=alpha*x+beta for the given data - * series x,y. - */ - float[] linest(float[] y) { - - int n = y.length; - float[] x = new float[n]; - for(int i = 0;imaxdist){ - maxdist = tmpdist; - argmax = i; - } - } - return argmax; - } - - - - /** - * this function creates a linear model y=alpha*x+beta for the given data - * series x,y. 
- */ - float[] linest(float[] y) { - - int n = y.length; - float[] x = new float[n]; - for(int i = 0;i>> 56); - s2[ct++] = (byte) (d >>> 48); - s2[ct++] = (byte) (d >>> 40); - s2[ct++] = (byte) (d >>> 32); - s2[ct++] = (byte) (d >>> 24); - s2[ct++] = (byte) (d >>> 16); - s2[ct++] = (byte) (d >>> 8); - s2[ct++] = (byte) (d); - - return computeCWowIntHash(s2, 0) % tablesize; - } - - @Override - public long hash(long[] s) { - byte[] s2 = new byte[s.length * 8]; - int ct = 0; - for (long d : s) { - s2[ct++] = (byte) (d >>> 56); - s2[ct++] = (byte) (d >>> 48); - s2[ct++] = (byte) (d >>> 40); - s2[ct++] = (byte) (d >>> 32); - s2[ct++] = (byte) (d >>> 24); - s2[ct++] = (byte) (d >>> 16); - s2[ct++] = (byte) (d >>> 8); - s2[ct++] = (byte) (d); - } - return computeCWowIntHash(s2, 0) % tablesize; - } - - public final static int CWOW_32_M = 0x57559429; - public final static int CWOW_32_N = 0x5052acdb; - public static final long LONG_LO_MASK = 0x00000000FFFFFFFFL; - - /** gather an int from the specified index into the byte array */ - public static final int gatherIntLE(byte[] data, int index) { - int i = data[index] & 0xFF; - i |= (data[++index] & 0xFF) << 8; - i |= (data[++index] & 0xFF) << 16; - i |= (data[++index] << 24); - return i; - } - - public static final int gatherPartialIntLE(byte[] data, int index, - int available) { - int i = data[index] & 0xFF; - if (available > 1) { - i |= (data[++index] & 0xFF) << 8; - if (available > 2) { - i |= (data[++index] & 0xFF) << 16; - } - } - return i; - } - - public int computeCWowIntHash(byte[] data, int seed) { - final int length = data.length; - /* cwfold( a, b, lo, hi ): */ - /* p = (u32)(a) * (u64)(b); lo ^=(u32)p; hi ^= (u32)(p >> 32) */ - /* cwmixa( in ): cwfold( in, m, k, h ) */ - /* cwmixb( in ): cwfold( in, n, h, k ) */ - int hVal = seed; - int k = length + seed + CWOW_32_N; - long p = 0; - int pos = 0; - int len = length; - while (len >= 8) { - int i1 = gatherIntLE(data, pos); - int i2 = gatherIntLE(data, pos + 4); - /* 
cwmixb(i1) = cwfold( i1, N, hVal, k ) */ - p = i1 * (long) CWOW_32_N; - k ^= p & LONG_LO_MASK; - hVal ^= (p >> 32); - /* cwmixa(i2) = cwfold( i2, M, k, hVal ) */ - p = i2 * (long) CWOW_32_M; - hVal ^= p & LONG_LO_MASK; - k ^= (p >> 32); - pos += 8; - len -= 8; - } - if (len >= 4) { - int i1 = gatherIntLE(data, pos); - /* cwmixb(i1) = cwfold( i1, N, hVal, k ) */ - p = i1 * (long) CWOW_32_N; - k ^= p & LONG_LO_MASK; - hVal ^= (p >> 32); - pos += 4; - len -= 4; - } - if (len > 0) { - int i1 = gatherPartialIntLE(data, pos, len); - /* cwmixb(i1) = cwfold( i1, N, hVal, k ) */ - p = (i1 & ((1 << (len * 8)) - 1)) * (long) CWOW_32_M; - hVal ^= p & LONG_LO_MASK; - k ^= (p >> 32); - } - p = (hVal ^ (k + CWOW_32_N)) * (long) CWOW_32_N; - k ^= p & LONG_LO_MASK; - hVal ^= (p >> 32); - hVal ^= k; - return hVal; - } - -} diff --git a/src/main/java/edu/uc/rphash/tests/ScalabilityTest.java b/src/main/java/edu/uc/rphash/tests/ScalabilityTest.java deleted file mode 100644 index b73826d..0000000 --- a/src/main/java/edu/uc/rphash/tests/ScalabilityTest.java +++ /dev/null @@ -1,130 +0,0 @@ -package edu.uc.rphash.tests; - -import java.util.ArrayList; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.ForkJoinPool; -import java.util.concurrent.Future; - -import edu.uc.rphash.RPHashAdaptive2PassParallel; -import edu.uc.rphash.RPHashSimpleParallel; -import edu.uc.rphash.RPHashStream; -import edu.uc.rphash.concurrent.VectorLevelConcurrency; -import edu.uc.rphash.tests.generators.GenerateStreamData; - -public class ScalabilityTest { - - public static long rphashstream(ArrayList vecsAndNoiseInThisRound, - int i, int k, GenerateStreamData gen1) { - RPHashStream rphit = new RPHashStream(k, gen1, i); - long timestart = System.nanoTime(); - //vecsAndNoiseInThisRound.parallelStream().map(vec-> - // VectorLevelConcurrency.computeSequential(vec, rphit.lshfuncs.get(0), rphit.is.get(0), rphit.getParam())); - for (float[] v : vecsAndNoiseInThisRound) - 
rphit.addVectorOnlineStep(v); - rphit.getCentroidsOfflineStep(); - - return System.nanoTime() - timestart; - } - - public static long rphashsimple(ArrayList vecsAndNoiseInThisRound, - int i, int k) { - RPHashSimpleParallel rphit = new RPHashSimpleParallel( - vecsAndNoiseInThisRound, k, i); - - long timestart = System.nanoTime(); - rphit.mapreduce1(); - rphit.mapreduce2(); - return System.nanoTime() - timestart; - } - - public static long rphashadaptive( - ArrayList vecsAndNoiseInThisRound, int i, int k) { - - RPHashAdaptive2PassParallel rphit = new RPHashAdaptive2PassParallel( - vecsAndNoiseInThisRound, k, i); - - long timestart = System.nanoTime(); - rphit.run(); - return System.nanoTime() - timestart; - } - - public static void scalability(int n) { - int k = 10; - int d = 1000; - float var = 1f; - Runtime rt = Runtime.getRuntime(); - // Random r = new Random(); - int NUM_Procs = rt.availableProcessors(); - - GenerateStreamData gen1 = new GenerateStreamData(k, d, var, 11331313); - - ArrayList vecsAndNoiseInThisRound = new ArrayList(n); - - // generate data in parallel - vecsAndNoiseInThisRound = gen1.genParallel(n); - - System.out.println(vecsAndNoiseInThisRound.size()); - System.out.printf("Threads\tSimple\tStream\tAdaptive\n"); - - long timesimple = 0, timeadaptive = 0, timestream = 0; - - for (int i = 1; i <= NUM_Procs; i++) { - - try { - //mix up the order - if(i%3==0){ - System.gc(); - Thread.sleep(1000); - timesimple = rphashsimple(vecsAndNoiseInThisRound, i, k); - System.gc(); - Thread.sleep(1000); - timeadaptive = rphashadaptive(vecsAndNoiseInThisRound, i, k); - System.gc(); - Thread.sleep(1000); - timestream = rphashstream(vecsAndNoiseInThisRound, i, k, gen1); - } - - if(i%3==1){ - System.gc(); - Thread.sleep(1000); - timeadaptive = rphashadaptive(vecsAndNoiseInThisRound, i, k); - System.gc(); - Thread.sleep(1000); - timesimple = rphashsimple(vecsAndNoiseInThisRound, i, k); - System.gc(); - Thread.sleep(1000); - timestream = 
rphashstream(vecsAndNoiseInThisRound, i, k, gen1); - } - - if(i%3==2){ - System.gc(); - Thread.sleep(1000); - timesimple = rphashsimple(vecsAndNoiseInThisRound, i, k); - System.gc(); - Thread.sleep(1000); - timestream = rphashstream(vecsAndNoiseInThisRound, i, k, gen1); - System.gc(); - Thread.sleep(1000); - timeadaptive = rphashadaptive(vecsAndNoiseInThisRound, i, k); - } - - System.out.printf("%d\t%.6f\t%.6f\t%.6f\n", i, - timesimple / 1e9f, timestream / 1e9f, - timeadaptive / 1e9f); - - } catch (Exception e) { - e.printStackTrace(); - System.out.println("Exception at Proc:" + String.valueOf(i)); - System.out.printf("%d\t%.6f\t%.6f\t%.6f\n", i, - timesimple / 1e9f, timestream / 1e9f, - timeadaptive / 1e9f); - - } - } - } - - public static void main(String[] args) throws InterruptedException { - ScalabilityTest.scalability(Integer.parseInt(args[0])); - - } -} diff --git a/src/main/java/edu/uc/rphash/tests/TestRPhash.java b/src/main/java/edu/uc/rphash/tests/TestRPhash.java index 2ff2498..6a6d8b3 100644 --- a/src/main/java/edu/uc/rphash/tests/TestRPhash.java +++ b/src/main/java/edu/uc/rphash/tests/TestRPhash.java @@ -5,7 +5,7 @@ import edu.uc.rphash.Centroid; import edu.uc.rphash.RPHashSimple; -import edu.uc.rphash.tests.clusterers.LloydIterativeKmeans; + import edu.uc.rphash.tests.generators.GenerateData; import edu.uc.rphash.util.VectorUtil; @@ -25,7 +25,7 @@ static void testRPHash(int k, int n,int d,float variance,int projdim){ long startTime = System.nanoTime(); - List M = ( new LloydIterativeKmeans(k,gen.data(),projdim)).getCentroids(); + long duration = (System.nanoTime() - startTime); // List aligned = VectorUtil.alignCentroids(M,gen.medoids()); diff --git a/src/main/java/edu/uc/rphash/tests/clusterers/AdaptiveMeanShift.java b/src/main/java/edu/uc/rphash/tests/clusterers/AdaptiveMeanShift.java deleted file mode 100644 index 122dac3..0000000 --- a/src/main/java/edu/uc/rphash/tests/clusterers/AdaptiveMeanShift.java +++ /dev/null @@ -1,432 +0,0 @@ -package 
edu.uc.rphash.tests.clusterers; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashSet; -import java.util.List; -import java.util.Set; - -import edu.uc.rphash.Centroid; -import edu.uc.rphash.Clusterer; -import edu.uc.rphash.Readers.RPHashObject; -import edu.uc.rphash.Readers.SimpleArrayReader; -import edu.uc.rphash.kdtree.KDTreeNN; -import edu.uc.rphash.kdtree.naiveNN; -import edu.uc.rphash.lsh.LSHkNN; -import edu.uc.rphash.tests.generators.GenerateData; -import edu.uc.rphash.util.VectorUtil; - -/* Adaptive Mean Shift (AMS) Algorithm - * - * - * Mean Shift algorithm based on methods described by Fukunaga and Hostetler - * 'Estimation of the Gradient of a Density Function, with Applications - * in Pattern Recognition' ( - * - * http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=1055330 - * - * - * - * Additional Kernel and Optimizations described by Cheng - * 'Mean Shift, Mode Seeking, and Clustering' (1995) - * - * http://dl.acm.org/citation.cfm?id=628711 - * - * - * - * Adaptive Mean Shift algorithm based on ... 
- * - * - */ - - -//TODO: Add labels to points for centroids -//TODO: add weights to centroid merging -> rphash (cardinality) -//TODO: windowMode -> Sample Point Estimator - -final class cStore{ - public int count; - public float[] centroid; - public float wcsse = 0; - public Centroid cent; - - public void addPoint(float[] point){ - this.count++; - this.wcsse += VectorUtil.distance(point, centroid); - this.cent.setCount(this.count); - //TODO: this.cent.setWCSS(this.wcsse); - } - - public cStore(float[] centroid){ - this.count = 0; - this.cent = new Centroid(centroid,0); - this.centroid = centroid; - this.wcsse = 0; - } - - public cStore(float[] window, float[] point) { - // TODO Auto-generated constructor stub - this.count = 0; - this.cent = new Centroid(window,0); - this.centroid = window; - this.wcsse = VectorUtil.distance(point, window); - } - -} - - -public class AdaptiveMeanShift implements Clusterer { - - List data; //global data storage - List centroids; //global centroid storage - private RPHashObject so; - private List cs; - - //Parameters - double h = 1; // bandwidth - - int kernelMode = 0; // mode (0:uniform; 1:gaussian) - - int windowMode = 1; // Determine how to perform the Adaptive Window - // 0 - No adaptivity; Basic Mean Shift - // 1 - Balloon Estimator - // 2 - Sample Point Estimator (TODO) - - int knnAlg = 2; //Determine what KNN algorithm to use - // 0 - kNN Naive - // 1 - kNN LSH - // 2 - KD-TREE kNN - - int k = 5; //Number of KNN points for adaptive window - - Clusterer weightClusters = null; - - - static int maxiters = 10000; //Max iterations before breaking search for convergence - float convergeValue = (float) 0.00001; //maximum change in each dimension to 'converge' - float blurPercent = (float) 2; //Amount to blur centroids to group similar Floats - - //TEST Parameters: - boolean debug = false; //Control Debug Output - boolean minimalOutput = true; //Print the minimal final output (pretty print) - boolean printCentroids = true; //Print out 
centroids (not pretty) - Set cent = new HashSet(); //Storage for grouping the clusters - - public void setMode(int mode){ this.kernelMode = mode; } - - public void setH(double h) { this.h = h; } - - public void setWinMode(int winMode){ this.windowMode = winMode; } - - public List getData() { return data; } - - public void setRawData(List data){ this.data = data; } - - - public AdaptiveMeanShift(){ - this.centroids = new ArrayList(); - this.cs = new ArrayList(); - } - - public AdaptiveMeanShift(int k, List data){ - this.k = k; - this.data = data; - this.centroids = new ArrayList(); - } - - public AdaptiveMeanShift(int h, int windowMode, int kernelMode, int k, List data){ - this.h = h; - this.windowMode = windowMode; - this.kernelMode = kernelMode; - this.k = k; - this.data = data; - this.centroids = new ArrayList(); - this.cs = new ArrayList(); - } - - public AdaptiveMeanShift(int h, int windowMode, int kernelMode, int k, List data, Clusterer c){ - this.h = h; - this.weightClusters = c; - this.windowMode = windowMode; - this.kernelMode = kernelMode; - this.k = k; - this.data = data; - this.centroids = new ArrayList(); - this.cs = new ArrayList(); - } - - public float calcMode(float curWindow, float workingData){ - float mPoint = 0; - float kern = 0; - - if (kernelMode == 0) //Uniform - mPoint = workingData; - else if (kernelMode == 1){ //Gaussian - float c = (float) (1.0/Math.pow(h,2)); - kern = (float) Math.exp(-c * Math.pow(workingData - curWindow, 2)); - mPoint = (float) kern * (workingData - curWindow); - } - - return mPoint; - } - - public void adaptH(List data, int curPoint, LSHkNN knnHandle, KDTreeNN kdHandle, naiveNN naiveHandle){ - if(windowMode == 0) //No adaptivity - return; - else if(windowMode == 1){ //Balloon - if(knnAlg == 0){ - h = Math.sqrt(naiveHandle.getNNEuc(k, data.get(curPoint))); - printDebug("naiveH: " + h); - } - if(knnAlg == 1){ - List retData = knnHandle.knn(k, data.get(curPoint)); - h = VectorUtil.distance(retData.get(retData.size() - 
1),data.get(curPoint)); - printDebug("LSHH: " + h); - } - if(knnAlg == 2){ - h = Math.sqrt(kdHandle.treeNNEuc(k, data.get(curPoint))); - printDebug("KDH: " + h + "\n"); - } - - return; - } - else if(windowMode == 2){ //KNN sample point estimator - return; - } - } - - - - public void cluster(List data){ - LSHkNN knnHandle = null; - KDTreeNN kdHandle = null; - naiveNN naiveHandle = null; - if(windowMode == 1){ - if(knnAlg == 0){ - naiveHandle = new naiveNN(data); - } - if(knnAlg == 1){ - knnHandle = new LSHkNN(data.get(0).length,5); - knnHandle.createDB(data); - } - if(knnAlg == 2){ - kdHandle = new KDTreeNN(); - kdHandle.createTree(data); - } - } - - - for(int i = 0; i < data.size(); i++){ - - float[] curWindow = new float[data.get(0).length]; - float[] bufWindow = new float[data.get(0).length]; - boolean converge = false; - int m = 0; - int winCount = 0; - - for(int t = 0; t < data.get(0).length; t++){ - curWindow = data.get(i).clone(); - } - - adaptH(data, i, knnHandle, kdHandle, naiveHandle); - - while((!converge) && (m < maxiters)){ - m++; - bufWindow = curWindow.clone(); - - for(int t = 0; t < data.get(0).length; t++){ - curWindow[t] = (float) 0; - } - - for(int x = 0; x < data.size(); x++){ - - if(VectorUtil.distance(bufWindow, data.get(x)) <= h){ - winCount++; - - for(int n = 0; n < data.get(x).length; n++){ - curWindow[n] = curWindow[n] + calcMode(bufWindow[n], data.get(x)[n]); - } - } - } - - if(winCount > 0){ - boolean convergeTest = true; - - for(int y = 0; y < curWindow.length; y++){ - if(curWindow[y] >= convergeValue) - convergeTest = false; - } - - if(kernelMode == 0){ - for(int y = 0; y < curWindow.length; y++){ - curWindow[y] = curWindow[y] / winCount; - } - } - if(kernelMode >= 1){ - for(int y = 0; y < curWindow.length; y++){ - curWindow[y] = curWindow[y] / winCount; - curWindow[y] = bufWindow[y] + curWindow[y]; - printDebug("New Window: " + curWindow[y]); - } - printDebug("_______________________________________"); - } - - - //Check for convergence 
- if(Arrays.equals(curWindow,bufWindow) || convergeTest){ - boolean add = true; - if(centroids.indexOf(curWindow) >= 0){ - add = false; - } - add = checkAllCentroids(curWindow, data.get(i)); - - if(add){ - String str = ""; - for(int j = 0; j < curWindow.length; j++){str += Float.toString(curWindow[j]) + ",";} - cent.add(str + "\n"); - } - - converge = true; - } - bufWindow = curWindow.clone(); - } - - m = 0; - winCount = 0; - } - } - - for(cStore cen: cs){ - Centroid it = new Centroid(cen.centroid, 0); - it.setCount(cen.count); - //TODO: it.setWCSS(cen.wcsse); - centroids.add(it); - - } - } - - - public boolean checkAllCentroids(float[] window, float[] point){ - float[] centroid; - for(cStore cz : cs){ - centroid = cz.centroid; - double percentDiff = 0; - - for(int z = 0; z < centroid.length; z++){ - percentDiff = percentDiff + Math.abs(1-(centroid[z] / window[z])); - } - - percentDiff = percentDiff / centroid.length; - - if(percentDiff < blurPercent){ - cz.addPoint(point); - return false; - } - - } - - cs.add(new cStore(window, point)); - return true; - } - - - void run(){ - if(this.weightClusters != null){ - //this.weightClusters.setData(this.data); - } - - cluster(this.data); - } - - - public void printDebug(String s){ - if(debug) - System.out.println(s); - } - - - public static void main(String[] args){ - int genClusters = 3; - int genRowsPerCluster =100; - int genColumns = 100; - - AdaptiveMeanShift ams = new AdaptiveMeanShift(); - - if(ams.data == null){ - GenerateData gen = new GenerateData(genClusters,genRowsPerCluster, genColumns); - ams.data = gen.data; - } - - ams.run(); - if(ams.printCentroids){ - System.out.println("Centroid Count: " + ams.centroids.size()); - for(Centroid c: ams.centroids){ - System.out.println("WCSS = " + c.getWCSS()); - System.out.print("Cent = "); - for(int z = 0; z < c.centroid().length; z++) - System.out.print(c.centroid()[z] + ","); - System.out.println("\n\n"); - } - } - if(ams.minimalOutput){ - System.out.println("\n\nh: " + 
ams.h); - System.out.println("Kernel Mode: " + ams.kernelMode); - System.out.println("Window Mode: " + ams.windowMode); - System.out.println("k (KNN): " + ams.k + "\n"); - System.out.println("Number of Clusters: " + ams.cent.size() + "\n"); - System.out.println(ams.cent.toString().replaceAll(", ", " ")); - } - - System.out.println("\n\nDone!"); - } - - - @Override - public List getCentroids() { - if(this.centroids.size() == 0) - run(); - return this.centroids; - } - - @Override - public RPHashObject getParam() { - so = new SimpleArrayReader(this.data, k); - return so; - } - - @Override - public void setK(int getk) { - this.k = getk; - } - - @Override - public void setWeights(List counts) { - // TODO Auto-generated method stub - if(data != null) { - - } - - - } - - @Override - public void setData(List centroids) { - ArrayList data = new ArrayList(centroids.size()); - for(Centroid c : centroids)data.add(c.centroid()); - setRawData(data); - } - - @Override - public void reset(int randomseed) { - // TODO Auto-generated method stub - this.centroids = null; - - } - - @Override - public boolean setMultiRun(int runs) { - // Return true to ignore multi-run (deterministic) - return true; - } - -} \ No newline at end of file diff --git a/src/main/java/edu/uc/rphash/tests/clusterers/Agglomerative.java b/src/main/java/edu/uc/rphash/tests/clusterers/Agglomerative.java deleted file mode 100644 index bedf99d..0000000 --- a/src/main/java/edu/uc/rphash/tests/clusterers/Agglomerative.java +++ /dev/null @@ -1,151 +0,0 @@ -package edu.uc.rphash.tests.clusterers; - -import java.util.ArrayList; -import java.util.List; - -import edu.uc.rphash.Centroid; -import edu.uc.rphash.Clusterer; -import edu.uc.rphash.Readers.RPHashObject; -import edu.uc.rphash.tests.generators.GenerateData; -import edu.uc.rphash.util.VectorUtil; - -public class Agglomerative implements Clusterer{ - - int k; - List clusters; - List data; - float[][] distances; - List counts; - public Agglomerative() - { - - } - 
public Agglomerative(int k, List data) - { - this.k = k; - this.data = data; - this.clusters = null; - counts = new ArrayList(); - for(int i = 0;i data){ - float[][] distances = new float[data.size()][data.size()]; - - for(int i = 0 ; i < data.size();i++) - { - for(int j = 0; j < data.size();j++) - distances[i][j] =VectorUtil.distance(data.get(i), data.get(j)); - } - return distances; - } - - - private float[] avgVector(float[] u, float[] v, Float float1, Float float2){ - float[] w = new float[u.length]; - for(int i = 0 ;i < u.length;i++)w[i] = (u[i]*float1+v[i]*float2)/(float1+float2); - return w; - } - - private void merge() - { - float minimum = 1000000f; - int mini = 0; - int minj = 0; - int i = 0 ; - for(float[] l : distances) - { - for(int j = 0; j < data.size();j++){ - if(l[j]k) - merge(); - } - - public static void main(String[] args){ - GenerateData gen = new GenerateData(3,500,2); - List data =gen.data; - float[][] dists = distanceArray(data); -// double[] weights = new double[data.size()]; - - - String[] s = new String[dists.length]; - for(int i = 0;i< dists.length;i++)s[i] = String.valueOf(i); - - Agglomerative agl = new Agglomerative(3, data); - agl.run(); - for(float[] cent: gen.getMedoids()){ - for(float f : cent)System.out.print(f+" "); - System.out.println(); - } - System.out.println("computed"); - - for(Centroid cent: agl.getCentroids()){ - for(float f : cent.centroid())System.out.print(f+" "); - System.out.println(); - } - - } - - @Override - public List getCentroids() { - if(clusters==null)run(); - List cents = new ArrayList<>(clusters.size()); - for(float[] v : this.clusters)cents.add(new Centroid(v,0)); - return cents; - } - - @Override - public void reset(int randomseed) { - clusters = null; - } - - - @Override - public RPHashObject getParam() { - // TODO Auto-generated method stub - return null; - } - - @Override - public void setWeights(List counts) { - //this.counts = counts; - counts = new ArrayList(); - } - @Override - public void 
setK(int getk) { - this.k = getk; - } - @Override - public void setData(List centroids) { - this.data = new ArrayList(centroids.size()); - for(Centroid c : centroids) data.add(c.centroid()); - } - @Override - public void setRawData(List centroids) { - this.data = centroids; - } - - @Override - public boolean setMultiRun(int runs) { - //agglomerative is deterministic running multiple times is moot - return true; - } -} diff --git a/src/main/java/edu/uc/rphash/tests/clusterers/Agglomerative2.java b/src/main/java/edu/uc/rphash/tests/clusterers/Agglomerative2.java deleted file mode 100644 index ad5cd4a..0000000 --- a/src/main/java/edu/uc/rphash/tests/clusterers/Agglomerative2.java +++ /dev/null @@ -1,372 +0,0 @@ -package edu.uc.rphash.tests.clusterers; - -import java.util.ArrayList; -import java.util.Iterator; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.TreeSet; - -import edu.uc.rphash.Centroid; -import edu.uc.rphash.Clusterer; -import edu.uc.rphash.Readers.RPHashObject; -import edu.uc.rphash.tests.StatTests; -import edu.uc.rphash.tests.generators.GenerateData; -import edu.uc.rphash.util.VectorUtil; - -public class Agglomerative2 implements Clusterer { - private class DistAndVector implements Comparable { - Float dist; - Integer vec; - - @Override - public int compareTo(DistAndVector o) { - // if (equals(o)) - // return 0; - if (dist.floatValue() == o.dist.floatValue()) - return 1; - if (dist.floatValue() < o.dist.floatValue()) - return -1; - return 1; - - } - } - - private class PQAndVector implements Comparable { - TreeSet pq; - Integer vec; - - PQAndVector(Integer vec) { - this.pq = new TreeSet(); - this.vec = vec; - } - - // @Override - // public boolean equals(Object o) { - // - // return ((PQAndVector) o).vec.intValue() == vec.intValue(); - // } - - @Override - public int compareTo(PQAndVector r) { - - // if (equals(r)){ - // System.out.println("whatthehell"+ r.vec.intValue() + ":"+ - // this.vec.intValue()); - // return 0; - // - 
// } - if (pq.isEmpty()) { - return 1; - } - if (r.pq.isEmpty()) { - return -1; - } - - if (pq.first().dist == r.pq.first().dist) { - return 1; - } - return pq.first().compareTo(r.pq.first()); - } - } - - int k; - TreeSet outerpq = new TreeSet(); - List data; - float counts[]; - - private void distanceArray(List data) { - int n = data.size(); - for (int i = 0; i < n - 1; i++) { - PQAndVector innerpq = new PQAndVector(new Integer(i)); - - for (int j = i + 1; j < n; j++) { - DistAndVector dv = new DistAndVector(); - dv.dist = new Float(VectorUtil.distance(data.get(i), - data.get(j))); - dv.vec = new Integer(j); - innerpq.pq.add(dv); - } - // - // System.out.print(i+" : "); - // for(Object p: - // innerpq.pq.toArray())System.out.print(((DistAndVector)p).vec+", ");System.out.println(); - outerpq.add(innerpq); - } - // for(PQAndVector p: - // outerpq)System.out.print(p.vec+", ");System.out.println(); - - } - - private void distanceArray2(List data2,List projIDs) { - int n = data.size(); - for (int i = 0; i < n - 1; i++) { - PQAndVector innerpq = new PQAndVector(new Integer(i)); - - for (int j = i + 1; j < n; j++) { - DistAndVector dv = new DistAndVector(); - if(projIDs.get(i).equals(projIDs.get(j))){ - dv.dist = Float.MAX_VALUE; - } - else{ - dv.dist = new Float(VectorUtil.distance(data.get(i), - data.get(j))); - } - - - dv.vec = new Integer(j); - innerpq.pq.add(dv); - } - outerpq.add(innerpq); - } - } - - private void mergeAndUpdateCentroids(int newdata, int olddata) - { - float[] u = data.get(newdata); - float[] v = data.get(olddata); - float ct1 = counts[newdata]; - float ct2 = counts[olddata]; - float[] w = new float[u.length]; - for (int i = 0; i < u.length; i++) - w[i] = (u[i] * ct1 + v[i] * ct2) / (ct1 + ct2); - counts[newdata] += counts[olddata]; - data.set(newdata, w); - - } - - /** - * remove the next two nearest vectors and perform a counts weighted average - * of the vectors. put this vector in the lower of the two vector indeces. 
- */ - private void merge() { - // pop the queue with the nearest top vector in it - - PQAndVector innerpq = outerpq.pollFirst(); - //lower id lists are not checked for removals, check here. - while (innerpq.pq.isEmpty()) { - innerpq = outerpq.pollFirst(); - } - // pop the nearest vector - DistAndVector dv = innerpq.pq.pollFirst(); - - int newvecloc = innerpq.vec; - int olddata = dv.vec; - - Iterator it = outerpq.iterator(); - while (it.hasNext()) { - PQAndVector v = it.next(); - if (v.vec.intValue() == olddata) { - - it.remove(); - break; - } - } - - // merge the two vectors - mergeAndUpdateCentroids(newvecloc, olddata); - - PQAndVector newpq = new PQAndVector(newvecloc); - - Iterator pqit = outerpq.iterator(); - while (pqit.hasNext()) { - PQAndVector itpq = pqit.next(); - - // remove the merged vectors from all inner lists - Iterator itdv = itpq.pq.iterator(); - while (itdv.hasNext()) { - DistAndVector v = itdv.next(); - if (v.vec.intValue() == newvecloc) { - itdv.remove(); - break; - } - } - - itdv = itpq.pq.iterator(); - while (itdv.hasNext()) { - DistAndVector v = itdv.next(); - if ( v.vec.intValue() == olddata) { - itdv.remove(); - break; - } - } - } - - // System.out.println("lists after removal"); - // printlists(); - - pqit = outerpq.iterator(); - while (pqit.hasNext()) { - PQAndVector itpq = pqit.next(); - // add the new vector distance to all upper parent lists - // compute new distance to vecs who have this vector in their list - if (itpq.vec < newvecloc) { - DistAndVector dv3 = new DistAndVector(); - dv3.dist = new Float(VectorUtil.distance(data.get(newvecloc), - data.get(itpq.vec))); - - dv3.vec = new Integer(newvecloc); - - // add the updated vector to the new lists - itpq.pq.add(dv3); - } - } - - // System.out.println("lists after adding back into lower idx lists"); - // printlists(); - - pqit = outerpq.iterator(); - while (pqit.hasNext()) { - PQAndVector itpq = pqit.next(); - // add to the new merge list - if (itpq.vec > newvecloc) { - 
DistAndVector dv2 = new DistAndVector(); - dv2.dist = new Float(VectorUtil.distance(data.get(newvecloc), - data.get(itpq.vec))); - dv2.vec = new Integer(itpq.vec); - newpq.pq.add(dv2); - } - } - - outerpq.add(newpq); - - // System.out.println("adding merged list back"); - // printlists(); - - } - - private void printlists() { - System.out.println(); - for (Object o : outerpq.toArray()) { - System.out.print("\t" + ((PQAndVector) o).vec + " : "); - for (Object p : ((PQAndVector) o).pq.toArray()) { - System.out.print(((DistAndVector) p).vec + ", "); - } - System.out.println(); - } - } - - private void run() { - while (outerpq.size() > k) { - - merge(); - } - Iterator pqit = outerpq.iterator(); - centroids = new ArrayList(); - while (pqit.hasNext()) { - PQAndVector innerpq = pqit.next(); - centroids.add(data.get(innerpq.vec)); - } - } - - public static void main(String[] args) { - - for (int i = 0; i < 1000; i += 10) { - long avgtime = 0; - float avgdistagg = 0; - float avgdistreal = 0; - float avgdistkm = 0; - if(i!=0){ - for (int j = 0; j < 5; j++) { - GenerateData gen = new GenerateData(10, i, 10, .5f); - List data = gen.data; - - long timestart = System.currentTimeMillis(); - Clusterer km1 = new LloydIterativeKmeans(10, data); - Clusterer ag1 = new Agglomerative2(10, data); - avgdistagg+=StatTests.WCSSECentroidsFloat(ag1.getCentroids(), data); - avgdistkm+=StatTests.WCSSECentroidsFloat(km1.getCentroids(), data); - avgdistreal+=StatTests.WCSSE(gen.getMedoids(), data); - avgtime += (System.currentTimeMillis() - timestart); - } - } - System.out.println(i + "\t" + avgtime / 5+"\t"+avgdistagg/5f+"\t"+avgdistkm/5f+"\t"+avgdistreal/5f); - } - - } - - List centroids; - - @Override - public List getCentroids() { - if(centroids==null)run(); - List cents = new ArrayList<>(centroids.size()); - for(float[] v : this.centroids)cents.add(new Centroid(v,0)); - return cents; - } - - @Override - public void reset(int randomseed) { - centroids = null; - } - - @Override - public 
RPHashObject getParam() { - return null; - } - - public void printDistanceArray() { - for (int i = 0; i < data.size(); i++) { - for (int j = 0; j < data.size(); j++) - System.out.printf("%.2f,", - VectorUtil.distance(data.get(i), data.get(j))); - System.out.println(); - } - System.out.println(); - } - - public Agglomerative2(int k, List data) { - this.k = k; - this.data = data; - this.counts = new float[data.size()]; - for (int i = 0; i < counts.length; i++) - counts[i] = 1; - - distanceArray(data); - - } - - public Agglomerative2(int k, List data, List counts) { - this.k = k; - this.data = data; - this.counts = new float[counts.size()]; - for (int i = 0; i < counts.size(); i++) - this.counts[i] = counts.get(i); - distanceArray(data); - } - - public Agglomerative2(int k, List data, List counts,List projectionIDs) { - this.k = k; - this.data = data; - this.counts = new float[counts.size()]; - for (int i = 0; i < counts.size(); i++) - this.counts[i] = counts.get(i); - distanceArray2(data,projectionIDs); - } - - @Override - public void setWeights(List counts) { - - } - - @Override - public void setData(List centroids) { - this.data = new ArrayList(centroids.size()); - for(Centroid c : centroids) data.add(c.centroid()); - } - @Override - public void setRawData(List centroids) { - this.data = centroids; - } - - @Override - public void setK(int getk) { - this.k = getk; - } - - @Override - public boolean setMultiRun(int runs) { - //agglomerative is deterministic running multiple times is moot - return true; - } - -} diff --git a/src/main/java/edu/uc/rphash/tests/clusterers/Kmeans.java b/src/main/java/edu/uc/rphash/tests/clusterers/Kmeans.java deleted file mode 100644 index 2f92ac1..0000000 --- a/src/main/java/edu/uc/rphash/tests/clusterers/Kmeans.java +++ /dev/null @@ -1,330 +0,0 @@ -package edu.uc.rphash.tests.clusterers; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.LinkedList; -import java.util.List; -import java.util.Random; - -import 
edu.uc.rphash.Centroid; -import edu.uc.rphash.Clusterer; -import edu.uc.rphash.Readers.RPHashObject; -import edu.uc.rphash.Readers.SimpleArrayReader; -import edu.uc.rphash.projections.DBFriendlyProjection; -import edu.uc.rphash.projections.Projector; -import edu.uc.rphash.tests.generators.GenerateData; -import edu.uc.rphash.util.VectorUtil; - -import org.apache.commons.lang3.ArrayUtils; -//import org.rosuda.JRI.REXP; -//import org.rosuda.JRI.Rengine; - -public class Kmeans implements Clusterer { - int k; - int n; - List data; - public int getK() { - return k; - } - - public void setK(int k) { - this.k = k; - } - - public List getData() { - return data; - } - - @Override - public void setData(List centroids) { - this.data = new ArrayList(centroids.size()); - for(Centroid c : centroids) data.add(c.centroid()); - } - @Override - public void setRawData(List centroids) { - this.data = centroids; - } - - public List getWeights() { - return weights; - } - - public void setWeights(List weights) { - this.weights = weights; - } - - int projdim; - -// List means; -// List kmeansCentroids = new ArrayList(); - List> clusters; - List weights; -// Rengine re; -// -// public void setRengine(Rengine re) { -// this.re = re; -// } - - public Kmeans(int k, List data) { - this.k = k; - this.data = data; - this.projdim = 0; - this.clusters = null; - this.weights = new ArrayList(data.size()); - for (int i = 0; i < data.size(); i++) - weights.add(1f); -// means = null; - } - - public Kmeans(int k, List data, List weights) { - this.k = k; - this.data = data; - this.projdim = 0; - this.clusters = null; - this.weights = weights; -// means = null; - } - - public Kmeans(int k, List data, int projdim) { - this.k = k; - this.data = data; - this.projdim = projdim; - this.clusters = null; - this.weights = new ArrayList(data.size()); - for (int i = 0; i < data.size(); i++) - weights.add(1f); -// means = null; - } - -// public Kmeans(Rengine re) { -// // TODO Auto-generated constructor stub -// 
this.re = re; -// } - - /* - public float[] computerCentroid(List vectors, List data) { - int d = data.get(0).length; - float[] centroid = new float[d]; - - for (int i = 0; i < d; i++) - centroid[i] = 0.0f; - - float w_total = 0f; - for (Integer v : vectors) { - w_total += weights.get(v); - } - - for (Integer v : vectors) { - float[] vec = data.get(v); - float weight = (float) weights.get(v) / (float) w_total; - for (int i = 0; i < d; i++) - centroid[i] += (vec[i] * weight); - } - return centroid; - } - - - ArrayList weightTotals; - - void updateMeans(List data) { - weightTotals = new ArrayList(); - if (means == null) { - means = new ArrayList(); - for (int i = 0; i < k; i++) - means.add(computerCentroid(clusters.get(i), data)); - } - for (int i = 0; i < k; i++) - means.set(i, computerCentroid(clusters.get(i), data)); - } - - int assignClusters(List data) { - int swaps = 0; - List> newClusters = new ArrayList>(); - for (int j = 0; j < k; j++) - newClusters.add(new ArrayList()); - - for (int clusterid = 0; clusterid < k; clusterid++) { - - for (Integer member : clusters.get(clusterid)) { - - int nearest = VectorUtil.findNearestDistance(data.get(member), - means); - newClusters.get(nearest).add(member); - if (nearest != clusterid) - swaps++; - } - - } - clusters = newClusters; - return swaps; - } - - - private void run() { - int maxiters = 1000; - int swaps = 2; - this.n = this.data.size(); - ArrayList workingdata = new ArrayList(); - // stuff for projected kmeans - Projector p = null; - Random r = new Random(); - if (projdim != 0) - p = new DBFriendlyProjection(this.data.get(0).length, projdim, - r.nextInt()); - for (float[] v : this.data) { - if (p != null) { - workingdata.add(p.project(v)); - } else - workingdata.add(v); - } - - int maxout = 0; - //loop until there are no more nullsets - boolean nullset = false; - do { - this.clusters = new ArrayList>(k); - // seed data with new clusters - ArrayList shufflelist = new ArrayList(data.size()); - for (int i = 0; i < 
data.size(); i++) - shufflelist.add(i); - - for (int i = 0; i < k; i++) { - List tmp = new LinkedList(); - tmp.add(shufflelist.remove(0)); - - for (int j = 1; j < workingdata.size() / k ; j++) { - int nxt = r.nextInt(shufflelist.size()); - tmp.add(shufflelist.remove(nxt)); - } - this.clusters.add(tmp); - } - - - cluster(maxiters, swaps, n, workingdata, clusters); - - nullset = false; - - for (List cluster : clusters) { - nullset |= (cluster.size() == 0); - } - - } while (nullset && ++maxout<100); - if (maxout == 100) - System.err.println("Warning: MaxIterations Reached Outer"); - - } - - public void cluster(int maxiters, int swaps, int n, - ArrayList workingdata, List> clusters) { - while (swaps > 0 && maxiters > 0) { - maxiters--; - updateMeans(workingdata); - swaps = assignClusters(workingdata); - } - if (maxiters == 0) - System.err.println("Warning: MaxIterations Reached"); - updateMeans(this.data); - } - */ - - public Kmeans() { - // TODO Auto-generated constructor stub - } - - @Override - public List getCentroids() { - // if (means == null) { - // run(); - -// Rengine re = Rengine.getMainEngine(); -// if(re == null) -// re = new Rengine(new String[] {"--no-save"}, false, null); - -// if (!re.waitForR()) -// System.out.println("Cannot load R"); - - ArrayList workingdata = new ArrayList(); - for (float[] v : this.data) - workingdata.add(v); - List kmeansCentroids = new ArrayList(); - - // Convert List data to a 2D array - float[][] matrix = new float[workingdata.size()][]; - matrix = workingdata.toArray(matrix); - - // Get the number of rows and columns of the 2D array - int rows = matrix.length; - String numRows = String.valueOf(rows); - - int cols = matrix[0].length; - String numCols = String.valueOf(cols); - - // Set k - String kAsString = String.valueOf(k); - - // Convert the 2D array to a 1D double array to feed into R - double[] oneDArray = flatten(matrix); - -// // Feed the 1D array, k and number of rows and columns to R -// re.assign("data", oneDArray); 
-// re.assign("numberOfRows", numRows); -// re.assign("numberOfCols", numCols); -// re.assign("k", kAsString); -// -// // Create the data matrix in R -// re.eval("dataMatrix <- matrix(data, nrow = as.numeric(numberOfRows), ncol = as.numeric(numberOfCols), byrow = TRUE)"); -// -// // Run k-means in R -// double[][] kmOut = re.eval("kmeans(dataMatrix, as.numeric(k), nstart = 25)$centers").asDoubleMatrix(); - - // Convert the 2D array back to List format -// for (int i = 0; i < kmOut.length; i++) { -// float[] vector = new float[kmOut[0].length]; -// for (int j = 0; j < kmOut[0].length; j++) -// vector[j] = (float) kmOut[i][j]; -// kmeansCentroids.add(vector); -// } -// re.end(); - // } - List l = new ArrayList<>(); - for(float[] f : kmeansCentroids) - l.add(new Centroid(f,0)); - return l; - } - - // Convert a 2D array to a 1D double array - public static double[] flatten(float[][] twoDArray) { - ArrayList oneDArray = new ArrayList(); - - for (int i = 0; i < twoDArray.length; i++) - for (int j = 0; j < twoDArray[i].length; j++) - oneDArray.add((double) twoDArray[i][j]); - - Double[] doubles = oneDArray.toArray(new Double[0]); - double[] d = ArrayUtils.toPrimitive(doubles); - - return d; - } - - @Override - public void reset(int randomseed) { - - } - - public static void main(String[] args) { - GenerateData gen = new GenerateData(8, 100, 100); - Kmeans kk = new Kmeans(5, gen.data(), 24); -// VectorUtil.prettyPrint(kk.getCentroids()); - } - - @Override - public RPHashObject getParam() { - return new SimpleArrayReader(this.data, k); - } - - @Override - public boolean setMultiRun(int runs) { - return false; - } - -} diff --git a/src/main/java/edu/uc/rphash/tests/clusterers/LloydIterativeKmeans.java b/src/main/java/edu/uc/rphash/tests/clusterers/LloydIterativeKmeans.java deleted file mode 100644 index 23ecf97..0000000 --- a/src/main/java/edu/uc/rphash/tests/clusterers/LloydIterativeKmeans.java +++ /dev/null @@ -1,250 +0,0 @@ -package edu.uc.rphash.tests.clusterers; - 
-import java.util.ArrayList; -import java.util.LinkedList; -import java.util.List; -import java.util.Random; - -import edu.uc.rphash.Centroid; -import edu.uc.rphash.Clusterer; -import edu.uc.rphash.Readers.RPHashObject; -import edu.uc.rphash.Readers.SimpleArrayReader; -import edu.uc.rphash.projections.DBFriendlyProjection; -import edu.uc.rphash.projections.Projector; -import edu.uc.rphash.tests.generators.GenerateData; -import edu.uc.rphash.util.VectorUtil; - -public class LloydIterativeKmeans implements Clusterer { - int k; - int n; - List data; - public int getK() { - return k; - } - - public void setK(int k) { - this.k = k; - } - - public List getData() { - return data; - } - - - @Override - public void setRawData(List data) { - this.data = data; - } - - @Override - public void setData(List centroids) { - ArrayList data = new ArrayList(centroids.size()); - for(Centroid c : centroids)data.add(c.centroid()); - setRawData(data); - } - - public List getWeights() { - return weights; - } - - public void setWeights(List weights) { - this.weights = weights; - } - - int projdim; - - List means; - List> clusters; - List weights; - - public LloydIterativeKmeans(int k, List data) { - this.k = k; - this.data = data; - this.projdim = 0; - this.clusters = null; - this.weights = new ArrayList(data.size()); - for (int i = 0; i < data.size(); i++) - weights.add(1f); - means = null; - } - - public LloydIterativeKmeans(int k, List data, List weights) { - this.k = k; - this.data = data; - this.projdim = 0; - this.clusters = null; - this.weights = weights; - means = null; - } - - public LloydIterativeKmeans(int k, List data, int projdim) { - this.k = k; - this.data = data; - this.projdim = projdim; - this.clusters = null; - this.weights = new ArrayList(data.size()); - for (int i = 0; i < data.size(); i++) - weights.add(1f); - means = null; - } - - public LloydIterativeKmeans() { - // TODO Auto-generated constructor stub - } - - public float[] computeCentroid(List vectors, List data) 
{ - int d = data.get(0).length; - float[] centroid = new float[d]; - - for (int i = 0; i < d; i++) - centroid[i] = 0.0f; - - float w_total = 0f; - for (Integer v : vectors) { - w_total += weights.get(v); - } - - for (Integer v : vectors) { - float[] vec = data.get(v); - float weight = (float) weights.get(v) / (float) w_total; - for (int i = 0; i < d; i++) - centroid[i] += (vec[i] * weight); - } - return centroid; - } - - ArrayList weightTotals; - - void updateMeans(List data) { - weightTotals = new ArrayList(); - if (means == null) { - means = new ArrayList(); - for (int i = 0; i < k; i++) - means.add(computeCentroid(clusters.get(i), data)); - } - for (int i = 0; i < k; i++) - means.set(i, computeCentroid(clusters.get(i), data)); - } - - int assignClusters(List data) { - int swaps = 0; - List> newClusters = new ArrayList>(); - for (int j = 0; j < k; j++) - newClusters.add(new ArrayList()); - - for (int clusterid = 0; clusterid < k; clusterid++) { - - for (Integer member : clusters.get(clusterid)) { - - int nearest = VectorUtil.findNearestDistance(data.get(member), - means); - newClusters.get(nearest).add(member); - if (nearest != clusterid) - swaps++; - } - - } - clusters = newClusters; - return swaps; - } - - private void run() { - int maxiters = 1000; - int swaps = 2; - this.n = this.data.size(); - ArrayList workingdata = new ArrayList(); - // stuff for projected kmeans - Projector p = null; - Random r = new Random(); - if (projdim != 0) - p = new DBFriendlyProjection(this.data.get(0).length, projdim, - r.nextInt()); - for (float[] v : this.data) { - if (p != null) { - workingdata.add(p.project(v)); - } else - workingdata.add(v); - } - - int maxout = 0; - //loop until there are no more nullsets - boolean nullset = false; - do { - this.clusters = new ArrayList>(k); - // seed data with new clusters - ArrayList shufflelist = new ArrayList(data.size()); - for (int i = 0; i < data.size(); i++) - shufflelist.add(i); - - for (int i = 0; i < k; i++) { - List tmp = new 
LinkedList(); - tmp.add(shufflelist.remove(0)); - - for (int j = 1; j < workingdata.size() / k ; j++) { - int nxt = r.nextInt(shufflelist.size()); - tmp.add(shufflelist.remove(nxt)); - } - this.clusters.add(tmp); - } - - - cluster(maxiters, swaps, n, workingdata, clusters); - - nullset = false; - - for (List cluster : clusters) { - nullset |= (cluster.size() == 0); - } - - } while (nullset && ++maxout<100); - if (maxout == 100) - System.err.println("Warning: MaxIterations Reached Outer"); - - } - - public void cluster(int maxiters, int swaps, int n, - ArrayList workingdata, List> clusters) { - while (swaps > 0 && maxiters > 0) { - maxiters--; - updateMeans(workingdata); - swaps = assignClusters(workingdata); - } - if (maxiters == 0) - System.err.println("Warning: MaxIterations Reached"); - updateMeans(this.data); - } - - @Override - public List getCentroids() { - if (means == null) - run(); - List centroids = new ArrayList<>(means.size()); - for(float[] f : means){ - centroids.add(new Centroid(f,0)); - } - return centroids; - } - - @Override - public void reset(int randomseed) { - means = null; - } - - public static void main(String[] args) { - GenerateData gen = new GenerateData(8, 100, 100); - LloydIterativeKmeans kk = new LloydIterativeKmeans(5, gen.data(), 24); -// VectorUtil.prettyPrint(kk.getCentroids()); - } - - @Override - public RPHashObject getParam() { - - return new SimpleArrayReader(this.data, k); - } - - @Override - public boolean setMultiRun(int runs) { - return false; - } - -} diff --git a/src/main/java/edu/uc/rphash/tests/clusterers/MLE2.java b/src/main/java/edu/uc/rphash/tests/clusterers/MLE2.java deleted file mode 100644 index 4519987..0000000 --- a/src/main/java/edu/uc/rphash/tests/clusterers/MLE2.java +++ /dev/null @@ -1,333 +0,0 @@ -package edu.uc.rphash.tests.clusterers; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Random; - -import edu.uc.rphash.Centroid; -import edu.uc.rphash.Clusterer; 
-import edu.uc.rphash.Readers.RPHashObject; -import edu.uc.rphash.Readers.SimpleArrayReader; - -/** - * @author lee - * learns mle model with T topics from words x docs counts data - */ -/* - * All private methods are so because they for speed reasons do not employ any - * form of checking for numerical stability. In the case of mle matrices this is - * acceptable as probability matrices are never negative and the dimensions of - * the matrices do not change. - * for a unit test of mle, mle will be used to produce the NMF of a matrix, functionality - * and correctness can be confirmed by finding the product to be equal to the input - * matrix - */ -public class MLE2 implements Clusterer { - - - - /** - * @param args - */ - public static void main(String[] args) { - float[][] F = {{1f, 0f, 0f, 2f, 0f},{1f, 1f, 1f, 0f, 0f}, {0f, 0f, 1f, 1f , 1f},{2f, 1f, 1f, 0f, 1f},{1f, 0f, 0f, 2f, 0f},{1f, 1f, 1f, 0f, 0f}, {0f, 0f, 1f, 1f , 1f},{2f, 1f, 1f, 0f, 1f}}; - for(float[] ff: F){ - System.out.println(); - for(float f: ff){ - System.out.print(f + " "); - } - }System.out.println(); - - - MLE2 mlobj = new MLE2(Arrays.asList(F),4,.00001f); - - printmat(normalize(F)); - printmat(mlobj.wt);printmat(mlobj.td); - printmat( multiply(mlobj.wt,mlobj.td)); - - } - - - int W;//words, rows - int D;//documents, columns - int T;// topics or latent classes - - public float[][] td; - public float[][] wt; - Listdata; - - public MLE2(List counts, int T,float epsilon) - { - this.data = counts; - W=counts.size(); - D = counts.get(0).length; - this.T = T; - mle(epsilon); - } - - // use if you want wt initialized to some specific value - public void mle( float epsilon) - { - float tot = sum(data); - td = normalize(ones(T,D)); - wt = normalize(rand(W,T)); - - - float[] E = sum1D(logDotProduct(data,multiply(wt,td))); - float F = sum(E)/tot; - float F_new ; - float rel_ch; - - - do - { - // Expectation Step - // td = norm(td .* ( wt' * ( counts ./ (wt * td) ) )); - td = 
normalize(dotProduct(td,(multiply(transpose(wt),dotDivide(data,multiply(wt,td)))))); - - //maximization step - //wt = normalize( wt .* ( ( counts ./ ( wt * td + eps ) ) * td' )) - wt = normalize(dotProduct(wt,multiply(dotDivide(data,multiply(wt,td)),transpose(td)))); - - //calculate log-likelihood - /* - * ___ ___ - * \ \ - * /__ /__ n(d,w) log P(d,w) - * d c D w c W - */ - E = sum1D(logDotProduct(data,multiply(wt,td))); - F_new = sum(E)/tot; - - //calculate iteration's relative change to determine convergence - rel_ch = Math.abs((F_new - F))/ Math.abs(F); - F= F_new; - - System.out.println(rel_ch); - - }while(rel_ch>epsilon); - - } - - - //testing status - works - //gets the pairwise products of two matrices - //no dimension checking - private static float[][] dotProduct(float[][] mat1, float[][] mat2) - { - float[][] rtrn = new float [mat1.length ][mat1[0].length]; - - for(int i = 0;i mat1, float[][] mat2) - { - float[][] rtrn = new float [mat1.size() ][mat1.get(0).length]; - - for(int i = 0;i mat1, float[][] mat2) - { - float[][] rtrn = new float [mat1.size() ][mat1.get(0).length]; - - for(int i = 0;i A) - { - float sum =0; - for(float[] ff:A){ - for(float f:ff) sum+=f; - } - return sum; - } - - //testing status - works - //give the sum of all the elements of a vector - public static float sum(float[]A) - { - float sum =0; - for(float f:A) sum+=f; - return sum; - } - - //testing status - works - //give the column vector sum of all the elements of a matrix - public static float[] sum1D(float[][] A) - { - float[] sum = new float[A[0].length]; - Arrays.fill(sum, 0f); - for(float[] ff:A) - { - for(int i = 0;i getCentroids() { - - // TODO Auto-generated method stub - return null; - } - - @Override - public RPHashObject getParam() { - return new SimpleArrayReader(this.data, T); - } - - @Override - public void setWeights(List counts) { - - } - - @Override - public void setRawData(List data) { - this.data = data; - } - - @Override - public void setData(List centroids) { 
- ArrayList data = new ArrayList(centroids.size()); - for(Centroid c : centroids)data.add(c.centroid()); - setRawData(data); - } - - @Override - public void setK(int getk) { - } - - @Override - public void reset(int randomseed) { - - } - - @Override - public boolean setMultiRun(int runs) { - return false; - } - -} diff --git a/src/main/java/edu/uc/rphash/tests/clusterers/MaxLikelihoodKMeans2.java b/src/main/java/edu/uc/rphash/tests/clusterers/MaxLikelihoodKMeans2.java deleted file mode 100644 index e3f72fa..0000000 --- a/src/main/java/edu/uc/rphash/tests/clusterers/MaxLikelihoodKMeans2.java +++ /dev/null @@ -1,451 +0,0 @@ -package edu.uc.rphash.tests.clusterers; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.Vector; - -import edu.uc.rphash.Centroid; -import edu.uc.rphash.Clusterer; -import edu.uc.rphash.RPHashStream; -import edu.uc.rphash.Readers.RPHashObject; -import edu.uc.rphash.tests.StatTests; -import edu.uc.rphash.tests.generators.GenerateStreamData; - -public class MaxLikelihoodKMeans2 implements Clusterer { - - class PointND { - - private int dimension; // number of coordinates of a point - private float[] coordinates; // the coordinates of a point - - /** - * Create a point centered at the origin of the specific dimension - **/ - public PointND(int dimension) { - this.dimension = dimension; - coordinates = new float[dimension]; - } - - public PointND(float[] data) { - this.dimension = data.length; - coordinates = data; - } - - /** - * Create a new point identical to point p - **/ - public PointND(PointND p) { - this.dimension = p.dimension; - this.coordinates = new float[dimension]; - for (int i = 0; i < dimension; i++) - this.coordinates[i] = p.coordinates[i]; - } - } - - private int n; // number of instances to classify - private int d; // number of coordinates of each point - private int k; // number of clusters - private PointND[] mu; // coordinate of means mu[j] of each cluster j - private 
Vector[] w; // holds the points classified into each class - // w[j] - private PointND[] sigma; // holds the standard deviation of each class i - private float[] prior; // holds the prior of each class i - // private float logLikelihood; // holds the log likelihood of each of the k - // Gaussians - private float MDL; // the minimum description length of the model - private int numIterations; - - private List centroids; - private PointND[] data; - - public MaxLikelihoodKMeans2(int getk, List data) { - this.data = new PointND[data.size()]; - for (int i = 0; i < data.size(); i++) { - this.data[i] = new PointND(data.get(i)); - } - this.centroids = null; - init(this.data, getk); - } - - public MaxLikelihoodKMeans2() { - } - - /** - * Intialize the parameters of the k-means algorithm Randomly assign a point - * in x to each mean mu[j] - **/ - private void init(PointND[] x, int k) { - this.n = x.length; - this.d = x[0].dimension; - this.k = k; - this.mu = new PointND[k]; - this.w = new Vector[k]; - this.numIterations = 0; - this.sigma = new PointND[k]; - this.prior = new float[k]; - - // randomly assign a point in x to each mean mu[j] - PointND randomPoint; - for (int j = 0; j < k; j++) { - randomPoint = x[(int) (Math.random() * (n - 1))]; - mu[j] = new PointND(randomPoint); - // each prior and standard deviation are set to zero - sigma[j] = new PointND(d); - prior[j] = 0; - } - } - - /** - * Runs the k-means algorithm with k clusters on the set of instances x Then - * find the quality of the model - **/ - public void run(PointND[] x, int k, float epsilon) { - float maxDeltaMeans = epsilon + 1; - PointND[] oldMeans = new PointND[k]; - // initialize n,k,mu[j] - init(x, k); - // iterate until there is no change in mu[j] - while (maxDeltaMeans > epsilon) { - // remember old values of the each mean - for (int j = 0; j < k; j++) { - oldMeans[j] = new PointND(mu[j]); - - } - - // classify each instance x[i] to its nearest class - // first we need to clear the class array since 
we are reclassifying - for (int j = 0; j < k; j++) { - w[j] = new Vector(); // could use clear but then have - // to init... - } - - for (int i = 0; i < n; i++) { - classify(x[i]); - } - // recompute each mean - computeMeans(); - // compute the largest change in mu[j] - maxDeltaMeans = maxDeltaMeans(oldMeans); - numIterations++; - } - } - - /** - * Classifies the point x to the nearest class - **/ - private void classify(PointND x) { - float dist = 0; - float smallestDist; - int nearestClass; - - // compute the distance x is from mean mu[0] - smallestDist = distance(x.coordinates, mu[0].coordinates); - nearestClass = 0; - - // compute the distance x is from the other classes - for (int j = 1; j < k; j++) { - dist = distance(x.coordinates, mu[j].coordinates); - if (dist < smallestDist) { - smallestDist = dist; - nearestClass = j; - } - } - // classify x into class its nearest class - w[nearestClass].add(x); - } - - float distance(float[] x, float[] y) { - float ret = 0.0f; - if (x.length != y.length) - return Float.MAX_VALUE; - for (int i = 0; i < x.length; i++) - ret += (x[i] - y[i]) * (x[i] - y[i]); - return (float) Math.sqrt(ret); - } - - float[] subtract(float[] x, float[] y) { - float[] ret = new float[x.length]; - if (x.length != y.length) - return null; - for (int i = 0; i < x.length; i++) - ret[i] = x[i] - y[i]; - return ret; - } - - float[] add(float[] x, float[] y) { - float[] ret = new float[x.length]; - if (x.length != y.length) - return null; - for (int i = 0; i < x.length; i++) - ret[i] = x[i] + y[i]; - return ret; - } - - float[] multiply(float[] x, float scalar) { - float[] ret = new float[x.length]; - for (int i = 0; i < x.length; i++) - ret[i] = x[i] * scalar; - return ret; - } - - public float max(float[] coordinates) { - float value; - float max = coordinates[0]; - for (int i = 1; i < coordinates.length; i++) { - value = coordinates[i]; - if (value > max) - max = value; - } - return max; - } - - /** - * Recompute mu[j] as the average of all 
points classified to the class w[j] - **/ - private void computeMeans() { - int numInstances; // number of instances in each class w[j] - PointND instance; - - // init the means to zero - for (int j = 0; j < k; j++) - mu[j] = new PointND(mu[j].dimension); - - // recompute the means of each cluster - for (int j = 0; j < k; j++) { - numInstances = w[j].size(); - for (int i = 0; i < numInstances; i++) { - instance = w[j].get(i); - mu[j] = new PointND( - add(mu[j].coordinates, instance.coordinates)); - // mu[j].add(instance); - } - // mu[j].multiply(1.0f / numInstances); - mu[j] = new PointND( - multiply(mu[j].coordinates, 1.0f / numInstances)); - } - - } - - /** - * Compute the maximum change over each mean mu[j] - **/ - private float maxDeltaMeans(PointND[] oldMeans) { - float delta; - oldMeans[0] = new PointND(subtract(oldMeans[0].coordinates, - mu[0].coordinates)); - // oldMeans[0].subtract(mu[0]); - - float maxDelta = max(oldMeans[0].coordinates); - for (int j = 1; j < k; j++) { - // oldMeans[j].subtract(mu[j]); - oldMeans[j] = new PointND(subtract(oldMeans[j].coordinates, - mu[j].coordinates)); - delta = max(oldMeans[j].coordinates); - if (delta > maxDelta) - maxDelta = delta; - } - return maxDelta; - } - - // /** - // * Compute the standard deviation of the k Gaussians - // **/ - // private void computeDeviation() { - // int numInstances; // number of instances in each class w[j] - // PointND instance; - // PointND temp; - // - // // set the standard deviation to zero - // for (int j = 0; j < k; j++) - // sigma[j].setToOrigin(); - // - // // for each cluster j... 
- // for (int j = 0; j < k; j++) { - // numInstances = w[j].size(); - // for (int i = 0; i < numInstances; i++) { - // instance = (PointND) (w[j].get(i)); - // temp = new PointND(instance); - // temp.subtract(mu[j]); - // temp.pow(2.0f); // (x[i]-mu[j])^2 - // temp.multiply(1.0f / numInstances); // multiply by proba of - // // having x[i] in cluster j - // sigma[j].add(temp); // sum i (x[i]-mu[j])^2 * p(x[i]) - // } - // sigma[j].pow((1.0f / 2f)); // because we want the standard deviation - // } - // } - // - // /** - // * Compute the priors of the k Gaussians - // **/ - // private void computePriors() { - // float numInstances; // number of instances in each class w[j] - // for (int j = 0; j < k; j++) { - // numInstances = w[j].size() * (1.0f); - // prior[j] = numInstances / n; - // } - // } - // - // /** - // * Assume the standard deviations and priors of each cluster have been - // * computed - // **/ - // private void computeLogLikelihood(PointND[] x) { - // float temp1 = 0; - // float temp2 = 0; - // PointND variance; - // float ln2 = (float) Math.log(2); - // // for each instance x - // for (int i = 0; i < n; i++) { - // // for each cluster j - // temp1 = 0; - // for (int j = 0; j < k; j++) { - // temp1 = temp1 + (x[i].normal(mu[j], sigma[j]) * prior[j]); - // } - // temp2 = (float) (temp2 + Math.log(temp1) / ln2); - // } - // logLikelihood = temp2; - // } - // - // /** - // * Assume the log likelihood and priors have been computed - // **/ - // private void computeMDL() { - // float temp = 0; - // float numInstances; - // float ln2 = (float) Math.log(2); - // for (int j = 0; j < k; j++) { - // numInstances = w[j].size(); - // for (int i = 0; i < d; i++) { - // temp = (float) (temp - Math.log(sigma[j].getCoordinate(i) - // / Math.sqrt(numInstances)) - // / ln2); - // } - // } - // MDL = temp - logLikelihood; - // } - - public float getMDL() { - return MDL; - } - - public List getCentroids() { - float epsilon = 0.01f; - if (centroids != null){ - - return 
centroids; - } - init(data, k); - run(data, d, epsilon); - centroids = new ArrayList(k); - for (int i = 0; i < k; i++) - centroids.add(new Centroid(mu[i].coordinates,0)); - - // compute sum of squares - double sigtotal = 0.0; - for (int i = 0; i < sigma.length; i++) - for (int j = 0; j < sigma[i].dimension; j++) - sigtotal += sigma[i].coordinates[j]; - - return centroids; - } - - @Override - public RPHashObject getParam() { - // TODO Auto-generated method stub - return null; - } - - @Override - public void setWeights(List counts) { - - } - - @Override - public void setData(List data) { - this.centroids = null; - this.data = new PointND[data.size()]; - for (int i = 0; i < data.size(); i++) { - this.data[i] = new PointND(data.get(i).centroid()); - } - } - @Override - public void setRawData(List data) { - this.centroids = null; - this.data = new PointND[data.size()]; - for (int i = 0; i < data.size(); i++) { - this.data[i] = new PointND(data.get(i)); - } - } - - @Override - public void setK(int getk) { - this.k = getk; - } - - @Override - public void reset(int randomseed) { - centroids = null; - } - - public static void main(String[] args) { - int k = 10; - int d = 240; - float var = 1f; - int interval = 1000; - Runtime rt = Runtime.getRuntime(); - - GenerateStreamData gen1 = new GenerateStreamData(k, d, var, 11331313); - GenerateStreamData noise = new GenerateStreamData(1, d, var * 10, - 11331313); - MaxLikelihoodKMeans2 km2 = new MaxLikelihoodKMeans2(); - // HartiganWongKMeans hwkm = new HartiganWongKMeans(); - - System.out.printf("\tKMeans\t\t\tNull\t\tReal\n"); - System.out - .printf("Vecs\tMem(KB)\tTime\tWCSSE\t\tTime\tWCSSE\t\tWCSSE\n"); - - long timestart = System.nanoTime(); - for (int i = 0; i < 2500000;) { - ArrayList vecsAndNoiseInThisRound = new ArrayList(); - ArrayList justvecsInThisRound = new ArrayList(); - - for (int j = 1; j < interval && i < 2500000; i++, j++) { - float[] vec = gen1.generateNext(); - vecsAndNoiseInThisRound.add(vec); - 
justvecsInThisRound.add(vec); - vecsAndNoiseInThisRound.add(noise.generateNext()); - } - - timestart = System.nanoTime(); - km2.setRawData(vecsAndNoiseInThisRound); - km2.setK(k); - - List cents = km2.getCentroids(); - long time = System.nanoTime() - timestart; - - rt.gc(); - long usedkB = (rt.totalMemory() - rt.freeMemory()) / 1024; - - double wcsse = StatTests.WCSSECentroidsFloat(cents, justvecsInThisRound); - double realwcsse = StatTests.WCSSE(gen1.medoids, - justvecsInThisRound); - System.out.printf("%d\t%d\t%.4f\t%.1f\t\t", i, usedkB, - time / 1000000000f, wcsse); - - cents = new HartiganWongKMeans(k, vecsAndNoiseInThisRound) - .getCentroids(); - time = System.nanoTime() - timestart; - usedkB = (rt.totalMemory() - rt.freeMemory()) / 1024; - wcsse = StatTests.WCSSECentroidsFloat(cents, justvecsInThisRound); - System.out.printf("%.4f\t%.1f\t\t%.1f\n", time / 1000000000f, - wcsse, realwcsse); - } - } - @Override - public boolean setMultiRun(int runs) { - return false; - } - -} diff --git a/src/main/java/edu/uc/rphash/tests/clusterers/SVD.java b/src/main/java/edu/uc/rphash/tests/clusterers/SVD.java deleted file mode 100644 index da67542..0000000 --- a/src/main/java/edu/uc/rphash/tests/clusterers/SVD.java +++ /dev/null @@ -1,456 +0,0 @@ -package edu.uc.rphash.tests.clusterers; - -import java.util.ArrayList; -import java.util.Collections; - -public class SVD { - /** - * returns U in a. normaly U is nr*nr, but if nr>nc only the first nc - * columns are returned (nice, saves memory). The columns of U have - * arbitrary sign, also the columns corresponding to near-zero singular - * values can vary wildly from other implementations. 
- *This function is adapted from the c coded method from Numerical Recipes in C - */ - - private float[][] A; - SVDMatrix svdmat; - - public SVD(float[][] A) - { - this.A=A; - } - - public void compute() - { - - - - float[] D = new float[A[0].length]; - float[][] V = new float[A[0].length][A[0].length]; - - svdmat = new SVDMatrix(A,D,V,A.length < A[0].length); - - svd(svdmat.getU(),svdmat.getD(),svdmat.getV()); - } - - public float[][] getU(){ - return svdmat.getV(); - } - - public float[][] getD() - { - return padV(svdmat.getD()); - } - - public float[][] getVT(){ - return transpose(svdmat.getV()); - } - - public void svd(float[][] a, float[] w, float[][] v) { - int i, its, j, jj, k, l = 0, nm = 0; - boolean flag; - int m = a.length; - int n = a[0].length; - float c, f, h, s, x, y, z; - float anorm = 0.f, g = 0.f, scale = 0.f; - float[] rv1 = new float[n]; - - for (i = 0; i < n; i++) { - l = i + 1; - rv1[i] = scale * g; - g = s = scale = 0.f; - if (i < m) { - for (k = i; k < m; k++) - scale += abs(a[k][i]); - if (scale != 0.0) { - for (k = i; k < m; k++) { - a[k][i] /= scale; - s += a[k][i] * a[k][i]; - } - f = a[i][i]; - g = -SIGN((float)Math.sqrt(s), f); - h = f * g - s; - a[i][i] = f - g; - // if (i!=(n-1)) { // CHECK - for (j = l; j < n; j++) { - for (s = 0, k = i; k < m; k++) - s += a[k][i] * a[k][j]; - f = s / h; - for (k = i; k < m; k++) - a[k][j] += f * a[k][i]; - } - // } - for (k = i; k < m; k++) - a[k][i] *= scale; - } - } - w[i] = scale * g; - g = s = scale = 0.0f; - if (i < m && i != n - 1) { // - for (k = l; k < n; k++) - scale += abs(a[i][k]); - if (scale != 0.) 
{ - for (k = l; k < n; k++) { // - a[i][k] /= scale; - s += a[i][k] * a[i][k]; - } - f = a[i][l]; - g = -SIGN((float)Math.sqrt(s), f); - h = f * g - s; - a[i][l] = f - g; - for (k = l; k < n; k++) - rv1[k] = a[i][k] / h; - if (i != m - 1) { // - for (j = l; j < m; j++) { // - for (s = 0, k = l; k < n; k++) - s += a[j][k] * a[i][k]; - for (k = l; k < n; k++) - a[j][k] += s * rv1[k]; - } - } - for (k = l; k < n; k++) - a[i][k] *= scale; - } - } // i= 0; --i) { - if (i < n - 1) { // - if (g != 0.) { - for (j = l; j < n; j++) - v[j][i] = (a[i][j] / a[i][l]) / g; - for (j = l; j < n; j++) { - for (s = 0, k = l; k < n; k++) - s += a[i][k] * v[k][j]; - for (k = l; k < n; k++) - v[k][j] += s * v[k][i]; - } - } - for (j = l; j < n; j++) - // - v[i][j] = v[j][i] = 0.0f; - } - v[i][i] = 1.0f; - g = rv1[i]; - l = i; - } - // for (i=IMIN(m,n);i>=1;i--) { // ! - // for (i = n-1; i>=0; --i) { - for (i = Math.min(m - 1, n - 1); i >= 0; --i) { - l = i + 1; - g = w[i]; - if (i < n - 1) // - for (j = l; j < n; j++) - // - a[i][j] = 0.0f; - if (g != 0.) 
{ - g = 1.f / g; - if (i != n - 1) { - for (j = l; j < n; j++) { - for (s = 0, k = l; k < m; k++) - s += a[k][i] * a[k][j]; - f = (s / a[i][i]) * g; - for (k = i; k < m; k++) - a[k][j] += f * a[k][i]; - } - } - for (j = i; j < m; j++) - a[j][i] *= g; - } else { - for (j = i; j < m; j++) - a[j][i] = 0.0f; - } - a[i][i] += 1.0; - } - for (k = n - 1; k >= 0; --k) { - for (its = 1; its <= 30; ++its) { - flag = true; - for (l = k; l >= 0; --l) { - nm = l - 1; - if ((abs(rv1[l]) + anorm) == anorm) { - flag = false; - break; - } - if ((abs(w[nm]) + anorm) == anorm) - break; - } - if (flag) { - c = 0.0f; - s = 1.0f; - for (i = l; i <= k; i++) { // - f = s * rv1[i]; - rv1[i] = c * rv1[i]; - if ((abs(f) + anorm) == anorm) - break; - g = w[i]; - h = pythag(f, g); - w[i] = h; - h = 1.0f / h; - c = g * h; - s = -f * h; - for (j = 0; j < m; j++) { - y = a[j][nm]; - z = a[j][i]; - a[j][nm] = y * c + z * s; - a[j][i] = z * c - y * s; - } - } - } // flag - z = w[k]; - if (l == k) { - if (z < 0.) { - w[k] = -z; - for (j = 0; j < n; j++) - v[j][k] = -v[j][k]; - } - break; - } // l==k - x = w[l]; - nm = k - 1; - y = w[nm]; - g = rv1[nm]; - h = rv1[k]; - f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2 * h * y); - g = pythag(f, 1.0f); - f = ((x - z) * (x + z) + h * ((y / (f + SIGN(g, f))) - h)) / x; - c = s = 1.0f; - for (j = l; j <= nm; j++) { - i = j + 1; - g = rv1[i]; - y = w[i]; - h = s * g; - g = c * g; - z = pythag(f, h); - rv1[j] = z; - c = f / z; - s = h / z; - f = x * c + g * s; - g = g * c - x * s; - h = y * s; - y *= c; - for (jj = 0; jj < n; jj++) { - x = v[jj][j]; - z = v[jj][i]; - v[jj][j] = x * c + z * s; - v[jj][i] = z * c - x * s; - } - z = pythag(f, h); - w[j] = z; - if (z != 0.0) { - z = 1.0f / z; - c = f * z; - s = h * z; - } - f = c * g + s * y; - x = c * y - s * g; - for (jj = 0; jj < m; ++jj) { - y = a[jj][j]; - z = a[jj][i]; - a[jj][j] = y * c + z * s; - a[jj][i] = z * c - y * s; - } - } // j= 0. ? 
abs(a) : -abs(a)); - } - - //creates a diagonal matrix by padding the vector(v) with zeros - public static float[][] padV(float[] v) - { - float rtrn[][] = new float[v.length][v.length]; - for(int i =0;i svdpairs; - float[] D; - float[][] U; - float[][]V; - boolean sorted; - - - public SVDMatrix(float[][] u, float[] d, float[][]v, boolean transpose) - { - transpose = false; - sorted = false; - - /* if(transpose){ - D=d; - U=transpose(v); - V=u; - }else - { - */ D=d; - U=u; - V=v; - // } - } - - public void sortSingularValues() - { - svdpairs = new ArrayList(D.length); - - for(int i = 0; i< D.length;i++){ - float urow[] = new float[U[0].length]; - float vrow[] = new float[V[0].length]; - - for(int j = 0; j { - float singularValue; - float[] urows; - float[] vrows; - - public SVDValuePairs(float singularValue, float[] urows,float[] vrows){ - this.singularValue = singularValue; - this.vrows = vrows; - this.urows = urows; - } - - public int compareTo(SVDValuePairs o) { - if(o instanceof SVDValuePairs) - return 0; - if(((SVDValuePairs)o).singularValue == this.singularValue)return 0; - if(((SVDValuePairs)o).singularValue > this.singularValue)return 1; - return -1; - } - } - } -} diff --git a/src/main/java/edu/uc/rphash/tests/testStreamingRPHash.java b/src/main/java/edu/uc/rphash/tests/testStreamingRPHash.java deleted file mode 100644 index ac62432..0000000 --- a/src/main/java/edu/uc/rphash/tests/testStreamingRPHash.java +++ /dev/null @@ -1,184 +0,0 @@ -package edu.uc.rphash.tests; - -import java.util.ArrayList; -import java.util.List; -import java.util.Random; - -import edu.uc.rphash.Centroid; -import edu.uc.rphash.RPHashMultiProj; -import edu.uc.rphash.RPHashStream; -import edu.uc.rphash.tests.clusterers.StreamingKmeans; -import edu.uc.rphash.tests.clusterers.StreamingKmeans2; -import edu.uc.rphash.tests.generators.GenerateStreamData; -import edu.uc.rphash.util.VectorUtil; - -public class testStreamingRPHash { - public static void readFileData(String[] args) throws 
Exception { - - int interval = 1000; - int k = 10; - String filename = "/home/lee/Desktop/Dimension3204/data.mat"; - int processors = Runtime.getRuntime().availableProcessors(); - if (args.length > 1) - filename = args[0]; - if (args.length > 2) - k = Integer.parseInt(args[1]); - if (args.length > 3) - processors = Integer.parseInt(args[0]); - - Runtime rt = Runtime.getRuntime(); - List data = VectorUtil.readFile(filename, false); - - RPHashStream rphit = new RPHashStream(data, k); - - // System.out.printf("Running Streaming RPHash on %d processors, d=%d,k=%d,n=%d\n",rphit.getProcessors(),d,k,interval); - // StreamClusterer rphit = new StreamingKmeans(data, k); - // System.out.printf("Running Streaming KMeans on %d processors, d=%d,k=%d\n",1,data.size(),k); - - System.out.printf("Vecs\tMem(KB)\tTime\tWCSSE\n"); - long timestart = System.nanoTime(); - - timestart = System.nanoTime(); - rphit.addVectorOnlineStep(data.get(0)); - for (int i = 1; i < 20000; i++) { - rphit.addVectorOnlineStep(data.get(i)); - - if (i % interval == 0) { - List cents = rphit.getCentroidsOfflineStep(); - long time = System.nanoTime() - timestart; - - rt.gc(); - long usedkB = (rt.totalMemory() - rt.freeMemory()) / 1024; - - double wcsse = StatTests.WCSSECentroidsFloat(cents, data); - - System.gc(); - System.out.printf("%d\t%d\t%.4f\t%.0f\n", i, usedkB, - time / 1000000000f, wcsse); - timestart = System.nanoTime(); - } - } - - } - - public static void generateAndStream() throws InterruptedException { - Random r = new Random(); - int k = 10; - int d = 1000; - float var = 1f; - int interval = 10000; - Runtime rt = Runtime.getRuntime(); - - GenerateStreamData gen1 = new GenerateStreamData(k, d, var, 11331313); - GenerateStreamData noise = new GenerateStreamData(1, d, var*10, 11331313); - RPHashStream rphit = new RPHashStream(k, gen1,rt.availableProcessors()); - StreamingKmeans2 skmi = new StreamingKmeans2(k, gen1,rt.availableProcessors()); - 
System.out.printf("\tStreamingRPHash\t\t\tStreamingKmeans\t\tReal\n"); - System.out.printf("Vecs\tMem(KB)\tTime\tWCSSE\t\tTime\tWCSSE\t\tWCSSE\n"); - - long timestart = System.nanoTime(); - for (int i = 0; i < 2500000;) { - ArrayList vecsAndNoiseInThisRound = new ArrayList(); - ArrayList justvecsInThisRound = new ArrayList(); - - for (int j = 1; j < interval && i < 2500000; i++, j++){ - float[] vec = gen1.generateNext(); - vecsAndNoiseInThisRound.add(vec); - justvecsInThisRound.add(vec); - if(r.nextInt(10)==1) - vecsAndNoiseInThisRound.add(noise.generateNext()); - } - - timestart = System.nanoTime(); - for (float[] f : vecsAndNoiseInThisRound) { - rphit.addVectorOnlineStep(f); - } - List cents = rphit.getCentroidsOfflineStep(); - long time = System.nanoTime() - timestart; - - rt.gc(); - long usedkB = (rt.totalMemory() - rt.freeMemory()) / 1024; - - double wcsse = StatTests.WCSSECentroidsFloat(cents, justvecsInThisRound); - double realwcsse = StatTests.WCSSE(gen1.medoids, justvecsInThisRound); - - System.out.printf("%d\t%d\t%.4f\t%.1f\t\t", i, usedkB, - time / 1000000000f, wcsse); - rt.gc(); - Thread.sleep(1000); - rt.gc(); - - timestart = System.nanoTime(); - for (float[] f : vecsAndNoiseInThisRound) { - skmi.addVectorOnlineStep(f); - } - - cents = skmi.getCentroidsOfflineStep(); - time = System.nanoTime() - timestart; - - rt.gc(); - usedkB = (rt.totalMemory() - rt.freeMemory()) / 1024; - - wcsse = StatTests.WCSSECentroidsFloat(cents, justvecsInThisRound); - // recreate vectors at execution time to check average - rt.gc(); - Thread.sleep(1000); - rt.gc(); - - System.out.printf("%.4f\t%.1f\t\t%.1f\n",time/ 1000000000f,wcsse,realwcsse); - } - } - - public static void streamingPushtest() { - int k = 10; - int d = 1000; - float var = 4.5f; - - GenerateStreamData gen1 = new GenerateStreamData(k, d, var, 11331313); - - RPHashStream rphit = new RPHashStream(k,gen1); - - ArrayList cts = new ArrayList(); - for (int i = 0; i < 10000; i++) { - long centroidCount = 
rphit.addVectorOnlineStep(gen1.generateNext()); -// if (centroidCount>1 ) { -// cts.add((int) centroidCount); -// List f = rphit.getTopIdSizes(); -// for (float ff : f) -// System.out.print(ff/(float)i + ","); -// System.out.print("]\n["); -// } - } - //System.out.println(cts.toString()); - } - - public static void main(String[] args) throws Exception { -// readFileData(args); - generateAndStream(); -// streamingPushtest(); - } - - static void prettyPrint(List cs){ - - int n = cs.get(0).centroid.length; - boolean curtailm = n > 10; - if (curtailm) { - for (int i = 0; i < 4; i++) { - VectorUtil.prettyPrint(cs.get(i).centroid); - } - for (int j = 0; j < n / 2; j++) - System.out.print("\t"); - System.out.print(" ...\n"); - for (int i = cs.size() - 4; i < cs.size(); i++) { - VectorUtil.prettyPrint(cs.get(i).centroid); - } - } else { - for (int i = 0; i < cs.size(); i++) { - VectorUtil.prettyPrint(cs.get(i).centroid); - System.out.print("\n"); - } - } - - } - -} From 6fb8e771ba4602619856de7c96464165b635b079 Mon Sep 17 00:00:00 2001 From: deysn Date: Tue, 3 Jul 2018 03:26:25 -0400 Subject: [PATCH 02/29] Adding the options for TWRP from the main RPHASH class to run from the command line. 
--- src/main/java/edu/uc/rphash/RPHash.java | 8 +- .../edu/uc/rphash/RPHashAdaptive2Pass.java | 63 +++--- .../edu/uc/rphash/Readers/RPHashObject.java | 2 +- src/main/java/edu/uc/rphash/TWRP1.java | 27 +-- src/main/java/edu/uc/rphash/TWRPv2.java | 205 ++++++------------ .../projections/DBFriendlyProjection.java | 20 +- 6 files changed, 119 insertions(+), 206 deletions(-) diff --git a/src/main/java/edu/uc/rphash/RPHash.java b/src/main/java/edu/uc/rphash/RPHash.java index 2977bc4..9038026 100644 --- a/src/main/java/edu/uc/rphash/RPHash.java +++ b/src/main/java/edu/uc/rphash/RPHash.java @@ -51,7 +51,7 @@ public class RPHash { static String[] clusteringmethods = { "simple", "streaming", "multiproj", - "kmeans", "pkmeans","kmeansplusplus", "streamingkmeans", "adaptive","dummy" }; + "kmeans", "pkmeans","kmeansplusplus", "streamingkmeans", "adaptive","dummy" ,"twrp"}; static String[] offlineclusteringmethods = { "singlelink", "completelink", "averagelink", "kmeans", "adaptivemeanshift", "kmpp", "multikmpp" , "dbscan", "none" }; static String[] projectionmethods = { "dbf", "fjlt", "rp", "svd", "noproj" }; @@ -720,6 +720,12 @@ public static List runConfigs(List untaggedArgs, runitems.add(new RPHashAdaptive2Pass(o)); break; } + + case "twrp": { + runitems.add(new TWRPv2(o)); + break; + } + case "dummy": { runitems.add(new DummyClusterer(so)); break; diff --git a/src/main/java/edu/uc/rphash/RPHashAdaptive2Pass.java b/src/main/java/edu/uc/rphash/RPHashAdaptive2Pass.java index 1ff2008..88a9637 100644 --- a/src/main/java/edu/uc/rphash/RPHashAdaptive2Pass.java +++ b/src/main/java/edu/uc/rphash/RPHashAdaptive2Pass.java @@ -170,32 +170,22 @@ public List> findDensityModes() { } - for (Long name: IDAndCent.keySet()){ - - String key =name.toString(); - - - String value = IDAndCent.get(name).toString() ; - - // String value1 = Arrays.toString(value.toString()); - - System.out.println(key + " " + value); - - - - -} - - for (Long name: IDAndID.keySet()){ - - String key =name.toString(); - 
String value = IDAndID.get(name).toString(); - System.out.println(key + " " + value); - - -} - - +// for (Long name: IDAndCent.keySet()){ +// +// String key =name.toString(); +// // String value = IDAndCent.get(name).toString() ; +// // String value1 = Arrays.toString(value.toString()); +// System.out.println(key ) ;//+ " " + value); +//} + + System.out.println("NumberOfMicroClustersBeforePruning = "+ IDAndCent.size()); +// for (Long name: IDAndID.keySet()){ +// String key =name.toString(); +// String value = IDAndID.get(name).toString(); +// System.out.println(key + " " + value); +// +// +//} // next we want to prune the tree by parent count comparison // follows breadthfirst search @@ -249,6 +239,8 @@ public List> findDensityModes() { // System.out.println(sortedIDList.get(i) + ":"+VectorUtil.longToString(sortedIDList.get(i))+":"+IDAndCent.get(sortedIDList.get(i)).size()); // } +// System.out.println("NumberOfMicroClusters_AfterPruning = "+ denseSetOfIDandCount.size()); + System.out.println("NumberOfMicroClusters_AfterPruning = "+ estcents.size()); return new ArrayList<>(estcents.values()); @@ -271,7 +263,7 @@ public void run() { centroids.add(medoid(clustermembers.get(i))); } Agglomerative3 aggloOffline = new Agglomerative3(centroids, so.getk()); - System.out.println(centroids.size()); +// System.out.println(centroids.size()); aggloOffline.setWeights(weights); this.centroids = aggloOffline.getCentroids(); } @@ -279,17 +271,17 @@ public void run() { public static void main(String[] args) throws FileNotFoundException, IOException { - int k = 3; + int k = 6; int d = 100; - int n = 2000; - float var = 1.0f;//0.5f; + int n = 5000; + float var = 1.5f;//0.5f; int count = 1; // System.out.printf("ClusterVar\t"); // for (int i = 0; i < count; i++) // System.out.printf("Trial%d\t", i); // System.out.printf("RealWCSS\n"); - for (float f = var; f < 1.01; f += 1.5f) { + for (float f = var; f < 1.51; f += 1.5f) { float avgrealwcss = 0; float avgtime = 0; // 
System.out.printf("%f\t", f); @@ -298,7 +290,7 @@ public static void main(String[] args) throws FileNotFoundException, // gen.writeCSVToFile(new // File("/home/lee/Desktop/reclsh/in.csv")); RPHashObject o = new SimpleArrayReader(gen.data, k); - o.setDimparameter(4); + o.setDimparameter(8); RPHashAdaptive2Pass rphit = new RPHashAdaptive2Pass(o); long startTime = System.nanoTime(); List centsr = rphit.getCentroids(); @@ -312,11 +304,10 @@ public static void main(String[] args) throws FileNotFoundException, String Output = "/C:/Users/user/Desktop/temp/OutputTwrpCents" ; VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); - // System.out.printf("%.0f\t", - // StatTests.WCSSECentroidsFloat(centsr, gen.data)); - // System.gc(); + System.out.printf("%.0f\n\t",StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.gc(); } - // System.out.printf("%.0f\n", avgrealwcss / count); + System.out.printf("%.0f\n", avgrealwcss / count); diff --git a/src/main/java/edu/uc/rphash/Readers/RPHashObject.java b/src/main/java/edu/uc/rphash/Readers/RPHashObject.java index 9f38946..5ecb470 100644 --- a/src/main/java/edu/uc/rphash/Readers/RPHashObject.java +++ b/src/main/java/edu/uc/rphash/Readers/RPHashObject.java @@ -29,7 +29,7 @@ public interface RPHashObject { final static int DEFAULT_NUM_PROJECTIONS = 1; public final static int DEFAULT_NUM_BLUR = 1; - final static long DEFAULT_NUM_RANDOM_SEED = 38006359550206753L; + final static long DEFAULT_NUM_RANDOM_SEED = 3800635955020675334L; final static int DEFAULT_NUM_DECODER_MULTIPLIER = 1; final static long DEFAULT_HASH_MODULUS = Long.MAX_VALUE; final static Decoder DEFAULT_INNER_DECODER = new Spherical(32,4,1);//new DepthProbingLSH(24);//new Leech();//new Spherical(16,2,2);//new MultiDecoder(24, new E8(1f));//new Golay();//new Spherical(64,2,1);//new Leech(3);//new PsdLSH();// diff --git a/src/main/java/edu/uc/rphash/TWRP1.java b/src/main/java/edu/uc/rphash/TWRP1.java index aaf3540..7a9f923 100644 --- 
a/src/main/java/edu/uc/rphash/TWRP1.java +++ b/src/main/java/edu/uc/rphash/TWRP1.java @@ -369,13 +369,7 @@ public List> findDensityModes() { return new ArrayList<>(estcents.values()); } - - - - - - - + @@ -605,9 +599,7 @@ public HashMap findDensityModes2() { } - - - + public void run() { @@ -682,16 +674,16 @@ public static void main(String[] args) throws FileNotFoundException, IOException { int k = 6;//6; - int d = 64;//16; - int n = 700; - float var = .5f; + int d = 100;//16; + int n = 5000; + float var = 1.5f; int count = 1; // System.out.printf("ClusterVar\t"); // for (int i = 0; i < count; i++) // System.out.printf("Trial%d\t", i); // System.out.printf("RealWCSS\n"); - for (float f = var; f < 1.01; f += 1.5f) { + for (float f = var; f < 1.51; f += 1.5f) { float avgrealwcss = 0; float avgtime = 0; // System.out.printf("%f\t", f); @@ -700,7 +692,7 @@ public static void main(String[] args) throws FileNotFoundException, // gen.writeCSVToFile(new // File("/home/lee/Desktop/reclsh/in.csv")); RPHashObject o = new SimpleArrayReader(gen.data, k); - o.setDimparameter(4); + o.setDimparameter(8); TWRP1 rphit = new TWRP1(o); long startTime = System.nanoTime(); List centsr = rphit.getCentroids(); @@ -710,9 +702,8 @@ public static void main(String[] args) throws FileNotFoundException, avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(), gen.getData()); - // System.out.printf("%.0f\t", - // StatTests.WCSSECentroidsFloat(centsr, gen.data)); - // System.gc(); + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.gc(); } System.out.printf("%.0f\n", avgrealwcss / count); diff --git a/src/main/java/edu/uc/rphash/TWRPv2.java b/src/main/java/edu/uc/rphash/TWRPv2.java index 08b05a2..d24a616 100644 --- a/src/main/java/edu/uc/rphash/TWRPv2.java +++ b/src/main/java/edu/uc/rphash/TWRPv2.java @@ -4,15 +4,15 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.util.ArrayList; -import java.util.Arrays; +//import 
java.util.Arrays; import java.util.HashMap; -import java.util.Iterator; +//import java.util.Iterator; import java.util.List; import java.util.Map.Entry; import java.util.Random; import java.util.TreeSet; import java.util.stream.Stream; -import java.util.Map; +//import java.util.Map; import edu.uc.rphash.Readers.RPHashObject; @@ -24,17 +24,14 @@ import edu.uc.rphash.util.VectorUtil; - public class TWRPv2 implements Clusterer, Runnable { boolean znorm = false; - private int counter; private float[] rngvec; private List centroids = null; - private RPHashObject so; public TWRPv2(RPHashObject so) { @@ -89,43 +86,39 @@ public static float[][] UpdateHashMap(float cnt_1, float[] x_1, ret[1] = x_r; return ret; } - - - - - + //float[] rngvec; the range vector is moot if incoming data has been normalized //post normalization it should all be zero centered, with variance 1 - + /* * super simple hash algorithm, reminiscient of pstable lsh */ // xt is the projected vector and x is the original vector , rngvec is the randomly generated vector of projected dim. - public long hashvec(float[] xt, float[] x, - HashMap> IDAndCent, HashMap> IDAndLabel,int ct) { - long s = 1; //fixes leading 0's bug - for (int i = 0; i < xt.length; i++) { -// s <<= 1; - s = s << 1 ; // left shift the bits of s by 1. - if (xt[i] > rngvec[i]) -// s += 1; - s= s+1; - - if (IDAndCent.containsKey(s)) { - IDAndLabel.get(s).add(ct); - IDAndCent.get(s).add(x); - } else { - List xlist = new ArrayList<>(); - xlist.add(x); - IDAndCent.put(s, xlist); - List idlist = new ArrayList<>(); - idlist.add(ct); - IDAndLabel.put(s, idlist); - } - } - return s; - } +// public long hashvec(float[] xt, float[] x, +// HashMap> IDAndCent, HashMap> IDAndLabel,int ct) { +// long s = 1; //fixes leading 0's bug +// for (int i = 0; i < xt.length; i++) { +//// s <<= 1; +// s = s << 1 ; // left shift the bits of s by 1. 
+// if (xt[i] > rngvec[i]) +//// s += 1; +// s= s+1; +// +// if (IDAndCent.containsKey(s)) { +// IDAndLabel.get(s).add(ct); +// IDAndCent.get(s).add(x); +// } else { +// List xlist = new ArrayList<>(); +// xlist.add(x); +// IDAndCent.put(s, xlist); +// List idlist = new ArrayList<>(); +// idlist.add(ct); +// IDAndLabel.put(s, idlist); +// } +// } +// return s; +// } public long hashvec2(float[] xt, float[] x, HashMap MapOfIDAndCent, HashMap MapOfIDAndCount,int ct) { @@ -136,9 +129,7 @@ public long hashvec2(float[] xt, float[] x, if (xt[i] > rngvec[i]) // s += 1; s= s+1; - - - + if (MapOfIDAndCent.containsKey(s)) { float CurrentCount = MapOfIDAndCount.get(s); @@ -157,8 +148,7 @@ public long hashvec2(float[] xt, float[] x, MapOfIDAndCent.put(s, MergedVector); } - - + else { float[] xlist = x; @@ -178,7 +168,7 @@ public long hashvec2(float[] xt, float[] x, * maps */ void addtocounter(float[] x, Projector p, - HashMap> IDAndCent,HashMap> IDandID,int ct) { + HashMap IDAndCent,HashMap IDandID,int ct) { float[] xt = p.project(x); // counter++; @@ -187,11 +177,11 @@ void addtocounter(float[] x, Projector p, // rngvec[i] += delta/(float)counter; // } - hashvec(xt,x,IDAndCent, IDandID,ct); + hashvec2(xt,x,IDAndCent, IDandID,ct); } void addtocounter(float[] x, Projector p, - HashMap> IDAndCent,HashMap> IDandID,int ct,float[] mean,float[] variance) + HashMap IDAndCent,HashMap IDandID,int ct,float[] mean,float[] variance) { float[] xt = p.project(StatTests.znormvec(x, mean, variance)); @@ -201,7 +191,7 @@ void addtocounter(float[] x, Projector p, // rngvec[i] += delta/(float)counter; // } - hashvec(xt,x,IDAndCent, IDandID,ct); + hashvec2(xt,x,IDAndCent, IDandID,ct); } static boolean isPowerOfTwo(long num) { @@ -214,10 +204,9 @@ static boolean isPowerOfTwo(long num) { */ - public HashMap findDensityModes2() { - HashMap> IDAndCent = new HashMap<>(); - HashMap> IDAndID = new HashMap<>(); + HashMap MapOfIDAndCent = new HashMap<>(); + HashMap MapOfIDAndCount = new HashMap<>(); // 
#create projector matrixs Projector projector = so.getProjectionType(); projector.setOrigDim(so.getdim()); @@ -232,7 +221,7 @@ public HashMap findDensityModes2() { // // #process data by adding to the counter // for (float[] x : so.getRawData()) // { -// addtocounter(x, projector, IDAndCent,IDAndID,ct++,mean,variance); +// addtocounter(x, projector, MapOfIDAndCent,MapOfIDAndCount,ct++,mean,variance); // } // } // @@ -241,82 +230,34 @@ public HashMap findDensityModes2() { for (float[] x : so.getRawData()) { - addtocounter(x, projector, IDAndCent, IDAndID,ct++); + addtocounter(x, projector, MapOfIDAndCent, MapOfIDAndCount,ct++); } } - for (Long name: IDAndCent.keySet()){ +// for (Long name: MapOfIDAndCent.keySet()){ - String key =name.toString(); - System.out.println(key ); +// String key =name.toString(); +// System.out.println(key); // String value = IDAndCent.get(name).toString() ; -// String value1 = Arrays.toString(value.toString()); - +// String value1 = Arrays.toString(value.toString()); // System.out.println(key + " " + value); - -} +//} - for (Long name: IDAndID.keySet()){ +// for (Long name: MapOfIDAndCount.keySet()){ // String key =name.toString(); // String value = IDAndID.get(name).toString(); // System.out.println(key + " " + value); - -} - - // we would compress the hashmaps. SetOfIDandCount has the ids and the counts corresponding to that id. - // we have two hashmaps: 1. IDAndCent and 2. IDAndID. we will use IDAndCent +//} - - HashMap MapOfIDAndCount = new HashMap(); - - HashMap MapOfIDAndCent = new HashMap(); - - for (Long cur_id : new TreeSet(IDAndCent.keySet())) - { - int cur_count = IDAndCent.get(cur_id).size(); - - MapOfIDAndCount.put(cur_id, (long) cur_count); // this has the hashids and counts. 
- - List bucketpoints = new ArrayList<>(); - - Iterator e = IDAndCent.get(cur_id).iterator(); - -// int i=1; - while (e.hasNext()) { - -// System.out.println(i++); - - bucketpoints.add(e.next()) ; - - } - - float [] bucketcent; - - bucketcent = medoid(bucketpoints); - - MapOfIDAndCent.put(cur_id, bucketcent); // this has the hashids and centroids. - -// System.out.println(cur_id + " " + cur_count); - - // int c = MapOfIDAndCent.get(cur_id).length; - - // System.out.println(cur_id + " " + c); - - - } - -// int NumberOfMicroClustersBeforePruning = MapOfIDAndCent.size() ; -// System.out.println("NumberOfMicroClustersBeforePruning = "+ NumberOfMicroClustersBeforePruning); + System.out.println("NumberOfMicroClustersBeforePruning = "+ MapOfIDAndCent.size()); // next we want to prune the tree by parent count comparison // follows breadthfirst search - - HashMap denseSetOfIDandCount2 = new HashMap(); for (Long cur_id : new TreeSet(MapOfIDAndCount.keySet())) @@ -358,25 +299,17 @@ public HashMap findDensityModes2() { } - - //remove keys with support less than 1 - Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); //64 so 6 bits? //stream = stream.filter(p -> p.getKey() > 64); - - List sortedIDList2= new ArrayList<>(); // sort and limit the list - stream2.sorted(Entry. comparingByValue().reversed()).limit(so.getk()*4) + stream2.sorted(Entry. 
comparingByValue().reversed()).limit(so.getk()*6) .forEachOrdered(x -> sortedIDList2.add(x.getKey())); - - - HashMap KeyAndCent = new HashMap<>(); HashMap KeyAndCount = new HashMap<>(); HashMap WeightAndCent = new HashMap<>(); @@ -389,12 +322,9 @@ public HashMap findDensityModes2() { WeightAndCent.put(MapOfIDAndCount.get(sortedIDList2.get(i)), MapOfIDAndCent.get(sortedIDList2.get(i))); } - - - + return WeightAndCent; - } @@ -413,8 +343,7 @@ public void run() { List weights2 =new ArrayList<>(); - int NumberOfMicroClusters = WeightAndClusters.size() ; - System.out.println("NumberOfMicroClusters = "+ NumberOfMicroClusters); + System.out.println("NumberOfMicroClusters_AfterPruning = "+ WeightAndClusters.size()); // int k = NumberOfMicroClusters>200+so.getk()?200+so.getk():NumberOfMicroClusters; @@ -426,8 +355,7 @@ public void run() { weights2.add((float)weights); centroids2.add(WeightAndClusters.get(weights)); } - - + //System.out.printf("\tvalueofK is "); //System.out.println( so.getk()); @@ -437,20 +365,16 @@ public void run() { aggloOffline.setWeights(weights2); this.centroids = aggloOffline.getCentroids(); - - + } - - - public static void main(String[] args) throws FileNotFoundException, IOException { - int k = 5;//6; - int d = 100;//16; - int n = 5000; + int k = 20;//6; + int d = 500;//16; + int n = 500000; float var = 1.5f; int count = 1; // System.out.printf("ClusterVar\t"); @@ -458,17 +382,21 @@ public static void main(String[] args) throws FileNotFoundException, // System.out.printf("Trial%d\t", i); // System.out.printf("RealWCSS\n"); - String Output = "/C:/Users/user/Desktop/temp/OutputTwrpCents" ; + String Output = "/C:/Users/user/Desktop/temp/OutputTwrpCents1" ; float f = var; float avgrealwcss = 0; float avgtime = 0; // System.out.printf("%f\t", f); GenerateData gen = new GenerateData(k, n/k, d, f, true, .5f); - // gen.writeCSVToFile(new - // File("/home/lee/Desktop/reclsh/in.csv")); + + // gen.writeCSVToFile(new File("/home/lee/Desktop/reclsh/in.csv")); + 
+ // List data = "/C:/Users/user/Desktop/temp/OutputTwrpCents1" + RPHashObject o = new SimpleArrayReader(gen.data, k); - o.setDimparameter(8); + + o.setDimparameter(28); TWRPv2 rphit = new TWRPv2(o); long startTime = System.nanoTime(); @@ -481,11 +409,10 @@ public static void main(String[] args) throws FileNotFoundException, VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); - // System.out.printf("%.0f\t", - // StatTests.WCSSECentroidsFloat(centsr, gen.data)); - // System.gc(); + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.gc(); - // System.out.printf("%.0f\n", avgrealwcss / count); + System.out.printf("%.0f\n", avgrealwcss / count); } diff --git a/src/main/java/edu/uc/rphash/projections/DBFriendlyProjection.java b/src/main/java/edu/uc/rphash/projections/DBFriendlyProjection.java index 7f676b8..ab1c7b7 100644 --- a/src/main/java/edu/uc/rphash/projections/DBFriendlyProjection.java +++ b/src/main/java/edu/uc/rphash/projections/DBFriendlyProjection.java @@ -108,6 +108,7 @@ public float[] project(float[] v) { // -sqrt(3/t) // n: original dimension // t: target OR projected dimension + static float[] projectN(float[] v, int[][] P, int[][] M, int t) { float[] r = new float[t]; float sum; @@ -117,10 +118,10 @@ static float[] projectN(float[] v, int[][] P, int[][] M, int t) { //------------------------------------------------- // this is what is there in spark code: - for (int col : M[i]) - sum -= v[col]; - for (int col : P[i]) - sum += v[col]; +// for (int col : M[i]) +// sum -= v[col]; +// for (int col : P[i]) +// sum += v[col]; //--------------------------------------------------- @@ -129,13 +130,10 @@ static float[] projectN(float[] v, int[][] P, int[][] M, int t) { for(int j=0;j Date: Thu, 23 Aug 2018 11:01:45 -0400 Subject: [PATCH 03/29] changed MAP to MultiMap, added cmd line parameters. Changed the MAP data structure to MultiMap so that one key can map to multiple values. 
Added command line parameters to set cutoff and choose zero/random vector. Fixed error for the Rphash Simple, changed the final centroids returned to correct values. --- src/main/java/edu/uc/rphash/RPHash.java | 31 ++++ .../edu/uc/rphash/RPHashAdaptive2Pass.java | 17 +- src/main/java/edu/uc/rphash/RPHashSimple.java | 37 +++- .../edu/uc/rphash/Readers/RPHashObject.java | 9 +- .../uc/rphash/Readers/SimpleArrayReader.java | 32 ++++ .../edu/uc/rphash/Readers/StreamObject.java | 24 +++ src/main/java/edu/uc/rphash/TWRPv2.java | 169 +++++++++++++----- 7 files changed, 266 insertions(+), 53 deletions(-) diff --git a/src/main/java/edu/uc/rphash/RPHash.java b/src/main/java/edu/uc/rphash/RPHash.java index 9038026..2098fa5 100644 --- a/src/main/java/edu/uc/rphash/RPHash.java +++ b/src/main/java/edu/uc/rphash/RPHash.java @@ -52,16 +52,24 @@ public class RPHash { static String[] clusteringmethods = { "simple", "streaming", "multiproj", "kmeans", "pkmeans","kmeansplusplus", "streamingkmeans", "adaptive","dummy" ,"twrp"}; + static String[] offlineclusteringmethods = { "singlelink", "completelink", "averagelink", "kmeans", "adaptivemeanshift", "kmpp", "multikmpp" , "dbscan", "none" }; + static String[] projectionmethods = { "dbf", "fjlt", "rp", "svd", "noproj" }; + static String[] ops = { "numprojections", "innerdecodermultiplier", "numblur", "randomseed", "hashmod", "parallel", "streamduration", "raw", "decayrate", "dimparameter", "decodertype", "offlineclusterer", "runs", "normalize", "projection" }; + static String[] decoders = { "dn", "e8", "golay", "multie8", "leech", "multileech", "sphere", "levypstable", "cauchypstable", "gaussianpstable", "adaptive", "origin" }; + + static String[] twrp_options = { "cutoff", "randomvector" }; + + public static void main(String[] args) throws NumberFormatException, IOException, InterruptedException { @@ -95,6 +103,12 @@ public static void main(String[] args) throws NumberFormatException, System.out.print(s + " ,"); System.out.print("]\n"); + 
System.out.print("\t twrp_options" + "\t:["); + for (String s : twrp_options) + System.out.print(s + " ,"); + System.out.print("]\n"); + + System.exit(0); } @@ -114,6 +128,9 @@ public static void main(String[] args) throws NumberFormatException, matched |= keyword.equals(match); for (String match : decoders) matched |= keyword.equals(match); + for (String match : twrp_options) + matched |= keyword.equals(match); + if (!matched) unmatchedkeywords.add(keyword); } @@ -503,6 +520,20 @@ public static List runConfigs(List untaggedArgs, o.setNormalize(Boolean.parseBoolean(taggedArgs.get("normalize"))); so.setNormalize(Boolean.parseBoolean(taggedArgs.get("normalize"))); } + + + if (taggedArgs.containsKey("cutoff")) { + o.setCutoff(Integer.parseInt(taggedArgs.get("cutoff"))); + so.setCutoff(Integer.parseInt(taggedArgs.get("cutoff"))); + } + + + if (taggedArgs.containsKey("randomvector")) { + o.setRandomVector(Boolean.parseBoolean(taggedArgs.get("randomvector"))); + so.setRandomVector(Boolean.parseBoolean(taggedArgs.get("randomvector"))); + } + + if (taggedArgs.containsKey("projection")) { switch (taggedArgs.get("projection")) { diff --git a/src/main/java/edu/uc/rphash/RPHashAdaptive2Pass.java b/src/main/java/edu/uc/rphash/RPHashAdaptive2Pass.java index 88a9637..927bf7f 100644 --- a/src/main/java/edu/uc/rphash/RPHashAdaptive2Pass.java +++ b/src/main/java/edu/uc/rphash/RPHashAdaptive2Pass.java @@ -223,9 +223,11 @@ public List> findDensityModes() { List sortedIDList= new ArrayList<>(); // sort and limit the list - stream.sorted(Entry. comparingByValue().reversed()).limit(so.getk()*4) + stream.sorted(Entry. 
comparingByValue().reversed()).limit(so.getk()*6) .forEachOrdered(x -> sortedIDList.add(x.getKey())); + System.out.println("NumberOfMicroClustersAfterPruning = "+ sortedIDList.size()); + // compute centroids HashMap> estcents = new HashMap<>(); @@ -233,6 +235,8 @@ public List> findDensityModes() { { estcents.put(sortedIDList.get(i), IDAndCent.get(sortedIDList.get(i))); } + + // System.out.println(); // for (int i =0; icentroids = new ArrayList<>(); List weights =new ArrayList<>(); - int k = clustermembers.size()>200+so.getk()?200+so.getk():clustermembers.size(); + // int k = clustermembers.size()>200+so.getk()?200+so.getk():clustermembers.size(); + int k = clustermembers.size(); for(int i=0;i centsr = rphit.getCentroids(); diff --git a/src/main/java/edu/uc/rphash/RPHashSimple.java b/src/main/java/edu/uc/rphash/RPHashSimple.java index 2e8cba9..83f7bc2 100644 --- a/src/main/java/edu/uc/rphash/RPHashSimple.java +++ b/src/main/java/edu/uc/rphash/RPHashSimple.java @@ -24,6 +24,7 @@ import edu.uc.rphash.standardhash.NoHash; import edu.uc.rphash.tests.StatTests; import edu.uc.rphash.tests.clusterers.KMeans2; +import edu.uc.rphash.tests.clusterers.MultiKMPP; import edu.uc.rphash.tests.generators.GenerateData; import edu.uc.rphash.tests.generators.GenerateStreamData; import edu.uc.rphash.tests.kmeanspp.KMeansPlusPlus; @@ -54,8 +55,8 @@ public RPHashObject map() { int logk = (int) (.5 + Math.log(so.getk()) / Math.log(2));// log k and // round to // integer - int k = so.getk() * logk; - is = new SimpleFrequentItemSet(k); + int k1 = so.getk() * logk; + is = new SimpleFrequentItemSet(k1); Decoder dec = so.getDecoderType(); dec.setCounter(is); @@ -205,13 +206,27 @@ public void accept(float[] t) { // // } + Clusterer offlineclusterer = so.getOfflineClusterer(); offlineclusterer.setData(centroids); offlineclusterer.setWeights(so.getCounts()); offlineclusterer.setK(so.getk()); + + // System.out.println("\n k sent to offline = "+ so.getk()); + this.centroids = 
offlineclusterer.getCentroids(); + + //System.out.println("\n cents in reduce from offline cluster = "+ this.centroids.size()); + + //System.out.println("\n cents in reduce after label mapping = "+ centroids.size()); + this.labelmap = VectorUtil.generateIDMap(centroids, this.centroids); - so.setCentroids(centroids); + + //so.setCentroids(centroids); + so.setCentroids(this.centroids); + + + return so; } @@ -272,12 +287,14 @@ private void run() { map(); reduce(); this.centroids = so.getCentroids(); + + } public static void main(String[] args) { int k = 10; - int d = 1000; - int n = 10000; + int d = 200; + int n = 1000; float var = 1f; int count = 5; System.out.printf("Decoder: %s\n", "Sphere"); @@ -296,9 +313,17 @@ public static void main(String[] args) { RPHashSimple rphit = new RPHashSimple(o); o.setDecoderType(new Spherical(32, 4, 1)); // o.setDimparameter(31); - o.setOfflineClusterer(new KMeans2()); + //o.setOfflineClusterer(new KMeans2()); + o.setOfflineClusterer(new MultiKMPP()); + + //System.out.println("\n k sent to offline in MAIN = "+ o.getk()); + long startTime = System.nanoTime(); List centsr = rphit.getCentroids(); + + //System.out.println("\n no of final cents : " + centsr.size()); + + avgtime += (System.nanoTime() - startTime) / 100000000; // avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(), diff --git a/src/main/java/edu/uc/rphash/Readers/RPHashObject.java b/src/main/java/edu/uc/rphash/Readers/RPHashObject.java index 5ecb470..2d5b0a5 100644 --- a/src/main/java/edu/uc/rphash/Readers/RPHashObject.java +++ b/src/main/java/edu/uc/rphash/Readers/RPHashObject.java @@ -90,6 +90,13 @@ public interface RPHashObject { void setDimparameter(int parseInt); int getDimparameter(); + void setCutoff(int parseInt); + int getCutoff(); + + void setRandomVector(boolean parseBoolean); + boolean getRandomVector(); + + // void setOfflineClusterer(Clusterer agglomerative3); // Clusterer getOfflineClusterer(); @@ -99,7 +106,7 @@ public interface RPHashObject { int 
getk(); - void setK(int getk); + void setK(int k); String toString(); void reset();//TODO rename to resetDataStream diff --git a/src/main/java/edu/uc/rphash/Readers/SimpleArrayReader.java b/src/main/java/edu/uc/rphash/Readers/SimpleArrayReader.java index eaf3c2f..8fe1916 100644 --- a/src/main/java/edu/uc/rphash/Readers/SimpleArrayReader.java +++ b/src/main/java/edu/uc/rphash/Readers/SimpleArrayReader.java @@ -33,6 +33,11 @@ public class SimpleArrayReader implements RPHashObject { private Clusterer clusterer; private boolean normalize = false; private Projector projector; + + boolean RandomVector = false; + int Cutoff; + + public void setRandomSeed(long randomSeed) { this.randomSeed = randomSeed; @@ -397,4 +402,31 @@ public void setProjectionType(Projector dbFriendlyProjection) { public Projector getProjectionType(){ return this.projector; } + + + + @Override + public void setCutoff(int parseInt) { + this.Cutoff = parseInt; + + } + + @Override + public int getCutoff() { + + return this.Cutoff; + } + + + + @Override + public void setRandomVector(boolean parseBoolean) { + this.RandomVector = parseBoolean; + } + public boolean getRandomVector() { + return this.RandomVector; + } + + + } diff --git a/src/main/java/edu/uc/rphash/Readers/StreamObject.java b/src/main/java/edu/uc/rphash/Readers/StreamObject.java index 460070f..428a891 100644 --- a/src/main/java/edu/uc/rphash/Readers/StreamObject.java +++ b/src/main/java/edu/uc/rphash/Readers/StreamObject.java @@ -41,6 +41,9 @@ public class StreamObject implements RPHashObject, Iterator { Decoder dec; float decayrate=0; boolean parallel = true; + boolean RandomVector; + int Cutoff; + ExecutorService executor; InputStream inputStream; @@ -425,4 +428,25 @@ public void setProjectionType(Projector dbFriendlyProjection) { public Projector getProjectionType() { return this.projector; } + + + + @Override + public void setCutoff(int parseInt) { + this.Cutoff = parseInt; + } + @Override + public int getCutoff() { + return this.Cutoff; 
+ } + + + @Override + public void setRandomVector(boolean parseBoolean) { + this.RandomVector = parseBoolean; + } + public boolean getRandomVector() { + return this.RandomVector; + } + } diff --git a/src/main/java/edu/uc/rphash/TWRPv2.java b/src/main/java/edu/uc/rphash/TWRPv2.java index d24a616..d98e752 100644 --- a/src/main/java/edu/uc/rphash/TWRPv2.java +++ b/src/main/java/edu/uc/rphash/TWRPv2.java @@ -7,13 +7,13 @@ //import java.util.Arrays; import java.util.HashMap; //import java.util.Iterator; +//import java.util.LinkedHashMap; import java.util.List; +//import java.util.Map; import java.util.Map.Entry; import java.util.Random; import java.util.TreeSet; import java.util.stream.Stream; -//import java.util.Map; - import edu.uc.rphash.Readers.RPHashObject; import edu.uc.rphash.Readers.SimpleArrayReader; @@ -23,6 +23,12 @@ import edu.uc.rphash.tests.generators.GenerateData; import edu.uc.rphash.util.VectorUtil; +//import org.apache.commons.collections.map.MultiValueMap; +//import org.apache.commons.collections.map.*; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Multimap; + + public class TWRPv2 implements Clusterer, Runnable { @@ -50,7 +56,6 @@ public List getCentroids() { return centroids; } - /* * X - set of vectors compute the medoid of a vector set */ @@ -88,8 +93,7 @@ public static float[][] UpdateHashMap(float cnt_1, float[] x_1, } //float[] rngvec; the range vector is moot if incoming data has been normalized - //post normalization it should all be zero centered, with variance 1 - + //post normalization it should all be zero centered, with variance 1 /* * super simple hash algorithm, reminiscient of pstable lsh */ @@ -127,7 +131,6 @@ public long hashvec2(float[] xt, float[] x, // s <<= 1; s = s << 1 ; // left shift the bits of s by 1. 
if (xt[i] > rngvec[i]) -// s += 1; s= s+1; if (MapOfIDAndCent.containsKey(s)) { @@ -203,8 +206,8 @@ static boolean isPowerOfTwo(long num) { * density mode via iterative deepening hash counting */ - - public HashMap findDensityModes2() { + public Multimap findDensityModes2() { + //public Map findDensityModes2() { HashMap MapOfIDAndCent = new HashMap<>(); HashMap MapOfIDAndCount = new HashMap<>(); // #create projector matrixs @@ -213,6 +216,7 @@ public HashMap findDensityModes2() { projector.setProjectedDim(so.getDimparameter()); projector.setRandomSeed(so.getRandomSeed()); projector.init(); + int cutoff = so.getCutoff(); int ct = 0; // if(znorm == true){ @@ -299,31 +303,68 @@ public HashMap findDensityModes2() { } - //remove keys with support less than 1 + System.out.println("NumberOfMicroClustersAfterPruning&beforesortingLimit = "+ denseSetOfIDandCount2.size()); + //remove keys with support less than 1 Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); //64 so 6 bits? //stream = stream.filter(p -> p.getKey() > 64); - + +// Stream> stream3 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); +// long counter= stream3.count(); +// System.out.println("NumberOfMicroClustersAfterPruning&limit_the_1s = "+ counter); + +// int cutoff= so.getk()*8; +// if (so.getk()*6 < 210) { cutoff=210+so.getk();} else { cutoff = so.getk()*8;} +// int cutoff = clustermembers.size()>200+so.getk()?200+so.getk():clustermembers.size(); +// System.out.println("Cutoff = "+ cutoff); + List sortedIDList2= new ArrayList<>(); // sort and limit the list - stream2.sorted(Entry. comparingByValue().reversed()).limit(so.getk()*6) + stream2.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) .forEachOrdered(x -> sortedIDList2.add(x.getKey())); - - HashMap KeyAndCent = new HashMap<>(); - HashMap KeyAndCount = new HashMap<>(); - HashMap WeightAndCent = new HashMap<>(); - - for (int i =0; i KeyAndCent = new HashMap<>(); +// HashMap KeyAndCount = new HashMap<>(); +// Map WeightAndCent = new HashMap<>(); +// Map WeightAndCent = new LinkedHashMap<>(); + Multimap multimapWeightAndCent = ArrayListMultimap.create(); + + +// for (int i =0; i X) { return ret; } - +// this updates the map two cents with different weigths are merged into one. public static float[][] UpdateHashMap(float cnt_1, float[] x_1, float cnt_2, float[] x_2) { diff --git a/src/main/java/edu/uc/rphash/decoders/SphericalRandom.java b/src/main/java/edu/uc/rphash/decoders/SphericalRandom.java new file mode 100644 index 0000000..32c4250 --- /dev/null +++ b/src/main/java/edu/uc/rphash/decoders/SphericalRandom.java @@ -0,0 +1,292 @@ +package edu.uc.rphash.decoders; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Random; + +import edu.uc.rphash.frequentItemSet.Countable; +import edu.uc.rphash.standardhash.MurmurHash; +import edu.uc.rphash.util.VectorUtil; + +/** + * Spherical LSH Decoder based on SLSH (lgpl) + * + * @author lee + * + */ +public class SphericalRandom implements Decoder { + int HashBits = 32; + final List> vAll; // vAll[i][j] is the vector $A_i \tilde v_j$ + // from + // the article. + int hbits; // Ceil(Log2(2*d)). + int d; // the dimension of the feature space. + int k; // number of elementary hash functions (h) to be concatenated to + // obtain a reliable enough hash function (g). LSH queries becomes + // more selective with increasing k, due to the reduced the + // probability of collision. + int l; // number of "copies" of the bins (with a different random matrices). + // Increasing L will increase the number of points the should be + // scanned linearly during query. 
+ float distance = 0; + + /** + * This class represent a spherical lsh scheme. Vectors are decoded to the + * nearest vertex of the d dimensional orthoplex reresented by a canonical + * ordered integer. + * + * @param d + * - the number of dimension in the orthoplex + * @param k + * - number of rotations of the fundamental hash functions + * @param L + * - the number to search, currently ignored in RPHash + */ + public SphericalRandom(int d, int k, int L) { + this.d = d;// number of dimensions + this.k = k;// number of elementary hash functions + this.l = L;// L;//number of copies to search + double nvertex = 2.0 * this.d; + this.hbits = (int) Math.ceil(Math.log(nvertex) / Math.log(2)); + int kmax = (int) (HashBits / this.hbits); + if (this.k > kmax) { + this.k = kmax; + System.out + .printf("k is too big, chopping down (%d->%d)\n", k, kmax); + } + + Random[] r = new Random[d]; + for (int i = 0; i < d; i++) + r[i] = new Random(); + + // For orthoplex, the basis Vectortors v_i are permutations of the + // Vectortor (1, 0, ..., 0), + // and -(1, 0, ..., 0). + // Thus R v_i simply picks up the ith row of the rotation matrix, up to + // a sign. + // This means we don't need any matrix multiplication; R matrix is the + // list of + // rotated vectors itself! 
+ this.vAll = new ArrayList>(k * l); // random rotation + // matrices + for (int i = 0; i < k * l; i++) { + this.vAll.add(i, randomRotation(this.d, r)); + } + } + + @Override + public int getDimensionality() { + return d; + } + + @Override + public long[] decode(float[] f) { + return Hash(f); + } + + @Override + public float getErrorRadius() { + return d; + } + + @Override + public float getDistance() { + return distance; + } + + long argmaxi(float[] p, int index) { + List vs = vAll.get(index); + long maxi = 0; + float max = 0; + for (int i = 0; i < this.d; i++) { + + float dot = dot(p, vs.get(i)); + // compute orthoplex of -1 and 1 simultaneously + + + //float dot = dotshift(p, vs.get(i)); // aas we are using dotshift the full matrix needs storing. incorporate that. + + float abs = dot >= 0 ? dot : -dot; + if (abs < max) { + continue; + } + max = abs; + maxi = dot >= 0 ? i : i + this.d; + } + return maxi; + } + + float norm(float[] t) { + float n = 0; + for (int i = 0; i < t.length; i++) { + n += t[i] * t[i]; + } + return (float) Math.sqrt(n); + } + + float[] scale(float[] t, float s) { + for (int i = 0; i < t.length; i++) { + t[i] *= s; + } + return t; + } + + float dot(float[] t, float[] u) { + float s = 0; + for (int i = 0; i < t.length; i++) { + s += t[i] * u[i]; + } + return s; + } + + + + float dotshift(float[] t, float[] u) { + float s = 0; + for (int i = 0; i < t.length; i++) { + + + s = (float) ( s + ((t[i]*0.2)+0.1) * u[i] ); + } + return s; + } + + + + float[] sub(float[] t, float[] u) { + for (int i = 0; i < t.length; i++) { + t[i] -= u[i]; + } + return t; + } + + float[] random(int d, Random[] r) { + + float[] v = new float[d]; + + for (int i = 0; i < d; i++) { + v[i] = (float) r[i].nextGaussian(); + } + return v; + } + + List randomRotation(int d, Random[] r2) { + ArrayList R = new ArrayList<>(d); + for (int i = 0; i < d; i++) { + R.add(i, random(d, r2)); + float[] u = R.get(i); + for (int j = 0; j < i; j++) { + float[] v = R.get(j); + float vnorm = 
norm(v); + if (vnorm == 0) { + return randomRotation(d, r2); + } + float[] vs = new float[v.length]; + System.arraycopy(v, 0, vs, 0, v.length); + scale(vs, dot(v, u) / vnorm); + u = sub(u, vs); + } + u = scale(u, 1.0f / norm(u)); + } + return R; + } + + // Hashes a single point slsh.l times, using a different set of + // random matrices created and stored by the constructor for each. + // Stores the result in g to avoid unnecessary allocations. + // + // SLSH requires that all vectors lie on a d-dimensional hypershpere, + // thus having the same norm. Only the Similarity method of FeatureVector + // is required to take the normalization into account. + // + // The complexity of this function is O(nLK) + long[] Hash(float[] p) { + int ri = 0; + long[] h = new long[l]; + float normp = norm(p); + p = scale(p, 1.0f / normp); + for (int i = 0; i < this.l; i++) { + for (int j = 0; j < this.k; j++) { + h[i] = h[i] | this.argmaxi(p, ri); + h[i] <<= this.hbits; + ri++; + } + } + + return h;//+ (int) (normp); + + } + + public static void main(String[] args) { + Random r = new Random(); + int d = 16; + int K = 3; + int L = 1; + Spherical sp = new Spherical(d, K, L); + + // MultiDecoder sp = new MultiDecoder( d, e8); + MurmurHash hash = new MurmurHash(Integer.MAX_VALUE); + float testResolution = 10000f; + + HashMap ctmap = new HashMap(); + + for (int i = 0; i < 400; i++) { + int ct = 0; + float distavg = 0.0f; + for (int j = 0; j < testResolution; j++) { + float p1[] = new float[d]; + float p2[] = new float[d]; + + // generate a vector + for (int k = 0; k < d; k++) { + p1[k] = r.nextFloat() * 2 - 1f; + p2[k] = (float) (p1[k] + r.nextGaussian() + * ((float) i / 1000f)); + } + float dist = VectorUtil.distance(p1, p2); + distavg += dist; + long[] l1 = sp.decode(p1); + long[] l2 = sp.decode(p2); + + ctmap.put(l1[0], + ctmap.containsKey(l1[0]) ? 
1 + ctmap.get(l1[0]) : 1); + + long hp1 = hash.hash(l1); + long hp2 = hash.hash(l2); + + // ctmap.put(hp1,ctmap.containsKey(hp1)?1+ctmap.get(hp1):1); + + ct += (hp2 == hp1) ? 1 : 0; + + } + + System.out.println(distavg / testResolution + "\t" + (float) ct + / testResolution); + } + } + + float[] variance; + +// @Override +// public void setVariance(float[] parameterObject) { +// variance = parameterObject; +// } +// +// @Override +// public float[] getVariance(){ +// return variance; +// } + + @Override + public boolean selfScaling() { + return true; + } + + @Override + public void setCounter(Countable counter) { + // TODO Auto-generated method stub + + } + +} From 3a1cbce174b438f58d0d3fd943cd9c7df80241f5 Mon Sep 17 00:00:00 2001 From: deysn Date: Thu, 15 Aug 2019 14:00:40 -0400 Subject: [PATCH 05/29] TWRPv3 creates 3 trees with diff. random bisection pts and merges them --- .classpath | 2 +- src/main/java/edu/uc/rphash/TWRPv2.java | 2 +- src/main/java/edu/uc/rphash/TWRPv3.java | 607 ++++++++++++++++++++++++ 3 files changed, 609 insertions(+), 2 deletions(-) create mode 100644 src/main/java/edu/uc/rphash/TWRPv3.java diff --git a/.classpath b/.classpath index 8d805ce..0e623b8 100644 --- a/.classpath +++ b/.classpath @@ -4,6 +4,6 @@ - + diff --git a/src/main/java/edu/uc/rphash/TWRPv2.java b/src/main/java/edu/uc/rphash/TWRPv2.java index 3353396..7a025b0 100644 --- a/src/main/java/edu/uc/rphash/TWRPv2.java +++ b/src/main/java/edu/uc/rphash/TWRPv2.java @@ -473,7 +473,7 @@ public static void main(String[] args) throws FileNotFoundException, o.setRandomVector(true); // System.out.println("cutoff = "+ o.getCutoff()); -// System.out.println("get_random_Vector = "+ o.getRandomVector()); + System.out.println("get_random_Vector = "+ o.getRandomVector()); TWRPv2 rphit = new TWRPv2(o); long startTime = System.nanoTime(); diff --git a/src/main/java/edu/uc/rphash/TWRPv3.java b/src/main/java/edu/uc/rphash/TWRPv3.java new file mode 100644 index 0000000..ff221d4 --- /dev/null +++ 
b/src/main/java/edu/uc/rphash/TWRPv3.java @@ -0,0 +1,607 @@ +package edu.uc.rphash; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +//import java.util.Arrays; +import java.util.HashMap; +//import java.util.Iterator; +//import java.util.LinkedHashMap; +import java.util.List; +//import java.util.Map; +import java.util.Map.Entry; +import java.util.Random; +import java.util.TreeSet; +import java.util.stream.Stream; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.clusterers.Agglomerative3; +import edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.VectorUtil; + +//import org.apache.commons.collections.map.MultiValueMap; +//import org.apache.commons.collections.map.*; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Multimap; + + + +public class TWRPv3 implements Clusterer, Runnable { + + boolean znorm = false; + + private int counter; + private float[] rngvec; + private float[] rngvec2; + private float[] rngvec3; + + private List centroids = null; + + private RPHashObject so; + + public TWRPv3(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + return getCentroids(); + } + + @Override + public List getCentroids() { + if (centroids == null) + run(); + return centroids; + } + + + + // combines two hashmaps of idsandcents + + public static HashMap mergehmapsidsandcents( + HashMap partidandcent1, + HashMap partidandcent2, + HashMap partidandcount1, + HashMap partidandcount2) +{ + // new empty map + HashMap combined = new HashMap<>(); + combined.putAll( partidandcent1); + + for(Long key : partidandcent2.keySet()) { + if(combined.containsKey(key)) { + + + Long weight1= partidandcount1.get(key); + + float[] cent1= combined.get(key); + + 
Long weight2= partidandcount2.get(key); + + float [] cent2= partidandcent2.get(key); + + float [][] joined = UpdateHashMap(weight1, cent1 ,weight2 , cent2 ); + float combinedCount = joined[0][0]; + float [] combinedCent = joined[1]; + + + combined.put(key,combinedCent); + + } + else { + combined.put(key,partidandcent2.get(key)); + } + } + + return (combined); + +} + + + + // combines two hashmaps of idsandcounts + + public static HashMap mergehmapsidsandcounts(HashMap partidandcount1, + HashMap partidandcountt2) + { + + + HashMap combined = new HashMap (); // new empty map + combined.putAll(partidandcount1); + + + + for(Long key : partidandcountt2.keySet()) { + if(combined.containsKey(key)) { + + + long value1 = combined.get(key); + long value2 = partidandcountt2.get(key); + long value3 = value1 + value2; + + combined.put(key,value3); + + + } + else { + combined.put(key,partidandcountt2.get(key)); + } + + + + } + return (combined); + } + + + + /* + * X - set of vectors compute the medoid of a vector set + */ + float[] medoid(List X) { + float[] ret = X.get(0); + for (int i = 1; i < X.size(); i++) { + for (int j = 0; j < ret.length; j++) { + ret[j] += X.get(i)[j]; + } + } + for (int j = 0; j < ret.length; j++) { + ret[j] = ret[j] / ((float) X.size()); + } + return ret; + } + +// this updates the map two cents with different weigths are merged into one. 
+ public static float[][] UpdateHashMap(float cnt_1, float[] x_1, + float cnt_2, float[] x_2) { + + float cnt_r = cnt_1 + cnt_2; + + float[] x_r = new float[x_1.length]; + + for (int i = 0; i < x_1.length; i++) { + x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; + + } + + float[][] ret = new float[2][]; + ret[0] = new float[1]; + ret[0][0] = cnt_r; + ret[1] = x_r; + return ret; + } + + + public long hashvec2( float[] xt, float[] x, + HashMap MapOfIDAndCent, HashMap MapOfIDAndCount,int ct, float[] rngvec) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { +// s <<= 1; + s = s << 1 ; // left shift the bits of s by 1. + if (xt[i] > rngvec[i]) + s= s+1; + + if (MapOfIDAndCent.containsKey(s)) { + + float CurrentCount = MapOfIDAndCount.get(s); + float CurrentCent [] = MapOfIDAndCent.get(s); + float CountForIncomingVector = 1; + float IncomingVector [] = x; + + float[][] MergedValues = UpdateHashMap(CurrentCount , CurrentCent, CountForIncomingVector, IncomingVector ); + + Long UpdatedCount = (long) MergedValues[0][0] ; + + float[] MergedVector = MergedValues[1] ; + + MapOfIDAndCount.put(s , UpdatedCount); + + MapOfIDAndCent.put(s, MergedVector); + + } + + else { + + float[] xlist = x; + MapOfIDAndCent.put(s, xlist); + MapOfIDAndCount.put(s, (long)1); + } + } + return s; + } + + + /* + * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid + * vector map + * + * hash the projected vector x and update the hash to centroid and counts + * maps + */ + void addtocounter(float[] x, Projector p, + HashMap IDAndCent,HashMap IDandID,int ct, float[] rngvec) { + float[] xt = p.project(x); + + hashvec2(xt,x,IDAndCent, IDandID,ct,rngvec ); + } + + void addtocounter(float[] x, Projector p, + HashMap IDAndCent,HashMap IDandID,int ct,float[] mean,float[] variance, float[] rngvec ) + { + float[] xt = p.project(StatTests.znormvec(x, mean, variance)); + + + hashvec2(xt,x,IDAndCent, IDandID,ct, rngvec); + } + + static boolean isPowerOfTwo(long 
num) { + return (num & -num) == num; + } + + + + + /* + * X - data set k - canonical k in k-means l - clustering sub-space Compute + * density mode via iterative deepening hash counting + */ + + + public Multimap findDensityModes2() { + //public Map findDensityModes2() { + HashMap MapOfIDAndCent1 = new HashMap<>(); + HashMap MapOfIDAndCount1 = new HashMap<>(); + + + HashMap MapOfIDAndCent2 = new HashMap<>(); + HashMap MapOfIDAndCount2 = new HashMap<>(); + + + HashMap MapOfIDAndCent3 = new HashMap<>(); + HashMap MapOfIDAndCount3 = new HashMap<>(); + + HashMap MapOfIDAndCent = new HashMap<>(); + HashMap MapOfIDAndCount = new HashMap<>(); + + + // #create projector matrixs + Projector projector = so.getProjectionType(); + projector.setOrigDim(so.getdim()); + projector.setProjectedDim(so.getDimparameter()); + projector.setRandomSeed(so.getRandomSeed()); + projector.init(); + int cutoff = so.getCutoff(); + + int ct = 0; + + { + + for (float[] x : so.getRawData()) + { + addtocounter(x, projector, MapOfIDAndCent1, MapOfIDAndCount1,ct++, rngvec); + addtocounter(x, projector, MapOfIDAndCent2, MapOfIDAndCount2,ct++, rngvec2); + addtocounter(x, projector, MapOfIDAndCent2, MapOfIDAndCount2,ct++, rngvec3); + + + } + } + + + MapOfIDAndCount=mergehmapsidsandcounts(MapOfIDAndCount1, MapOfIDAndCount2); + + MapOfIDAndCent = mergehmapsidsandcents(MapOfIDAndCent1,MapOfIDAndCent2,MapOfIDAndCount1, MapOfIDAndCount2 ); + + MapOfIDAndCent = mergehmapsidsandcents(MapOfIDAndCent,MapOfIDAndCent3,MapOfIDAndCount, MapOfIDAndCount3 ); + + MapOfIDAndCount=mergehmapsidsandcounts(MapOfIDAndCount, MapOfIDAndCount3); + + + + + + + System.out.println("NumberOfMicroClustersBeforePruning = "+ MapOfIDAndCent.size()); + + // next we want to prune the tree by parent count comparison + // follows breadthfirst search + + + + + HashMap denseSetOfIDandCount2 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) 
(MapOfIDAndCount.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + +//HashMap> IDAndCent = new HashMap<>(); and HashMap MapOfIDAndCent = new HashMap(); + + MapOfIDAndCent.put(parent_id, new float[]{}); + + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent.put(parent_id, new float[]{}); + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2.put(cur_id, (long) cur_count); + } + } + } + } + } + + + System.out.println("NumberOfMicroClustersAfterPruning&beforesortingLimit = "+ denseSetOfIDandCount2.size()); + + //remove keys with support less than 1 + Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); + //64 so 6 bits? + //stream = stream.filter(p -> p.getKey() > 64); + +// Stream> stream3 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); +// long counter= stream3.count(); +// System.out.println("NumberOfMicroClustersAfterPruning&limit_the_1s = "+ counter); + +// int cutoff= so.getk()*8; +// if (so.getk()*6 < 210) { cutoff=210+so.getk();} else { cutoff = so.getk()*8;} +// int cutoff = clustermembers.size()>200+so.getk()?200+so.getk():clustermembers.size(); +// System.out.println("Cutoff = "+ cutoff); + + List sortedIDList2= new ArrayList<>(); + // sort and limit the list + stream2.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2.add(x.getKey())); + +// HashMap KeyAndCent = new HashMap<>(); +// HashMap KeyAndCount = new HashMap<>(); +// Map WeightAndCent = new HashMap<>(); +// Map WeightAndCent = new LinkedHashMap<>(); + Multimap multimapWeightAndCent = ArrayListMultimap.create(); + + + +// + for (Long keys: sortedIDList2) + + { +// WeightAndCent.put((Long)(MapOfIDAndCount.get(keys)), (float[]) (MapOfIDAndCent.get(keys))); + + multimapWeightAndCent.put((Long)(MapOfIDAndCount.get(keys)), (float[]) (MapOfIDAndCent.get(keys))); + +// KeyAndCent.put(keys, MapOfIDAndCent.get(keys)); +// KeyAndCount.put(keys, MapOfIDAndCount.get(keys)); + + } + + + + return multimapWeightAndCent; + +} + + + public void run() { + rngvec = new float[so.getDimparameter()]; + + + rngvec2 = new float[so.getDimparameter()]; + + rngvec3 = new float[so.getDimparameter()]; + + counter = 0; + boolean randVect = so.getRandomVector(); + + // Random r = new Random(so.getRandomSeed()); + // Random r = new Random(3800635955020675334L) ; + Random r = new Random(); + Random r2 = new Random(); + Random r3 = new Random(); + + if (randVect==true){ + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) r.nextGaussian(); + + for (int i = 0; i < so.getDimparameter(); i++) + rngvec2[i] = (float) r2.nextGaussian(); + + for (int i = 0; i < so.getDimparameter(); i++) + rngvec3[i] = (float) r3.nextGaussian(); + + + } else { + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) 0; + } + + + + + + + Multimap WeightAndClusters = findDensityModes2(); + //Map WeightAndClusters = findDensityModes2(); + + + + + Listcentroids2 = new ArrayList<>(); + List weights2 =new ArrayList<>(); + + + System.out.println("NumberOfMicroClusters_AfterPruning = "+ WeightAndClusters.size()); + System.out.println("getRandomVector = "+ randVect); + + // int k = NumberOfMicroClusters>200+so.getk()?200+so.getk():NumberOfMicroClusters; + + // have to prune 
depending NumberOfMicroClusters returned. + // int i = 1; + // int j=1; + // for (Long weights : new TreeSet(WeightAndClusters.keySet())) + for (Long weights : WeightAndClusters.keys()) + { + // System.out.println("NumberOfTreesetkeys = "+ i); + // String key =weights.toString(); + // System.out.println(weights); + weights2.add((float)weights); + // centroids2.add(WeightAndClusters.get(weights)); + // centroids2.addAll(WeightAndClusters.get(weights)); + // i=i+1; + } + // System.out.println("done printing keys for weights"); + + for (Long weight : WeightAndClusters.keySet()) + + { + // System.out.println(weight); + // System.out.println("NumberOfTreesetkeys = "+ j); + centroids2.addAll(WeightAndClusters.get(weight)); + + // j=j+1; + } + // System.out.println("done printing keys for centroids"); + + // System.out.println(weights2.size()); + // System.out.println(centroids2.size()); + + //System.out.printf("\tvalueofK is "); + //System.out.println( so.getk()); + + Agglomerative3 aggloOffline = new Agglomerative3(centroids2, so.getk()); + + aggloOffline.setWeights(weights2); + + this.centroids = aggloOffline.getCentroids(); + + } + + + public static void main(String[] args) throws FileNotFoundException, + IOException { + + int k = 10;//6; + int d = 700;//16; + int n = 10000; + float var = 1.5f; + int count = 1; + // System.out.printf("ClusterVar\t"); + // for (int i = 0; i < count; i++) + // System.out.printf("Trial%d\t", i); + // System.out.printf("RealWCSS\n"); + + String Output = "/C:/Users/deysn/Desktop/temp/OutputTwrpCents1" ; + + float f = var; + float avgrealwcss = 0; + float avgtime = 0; + // System.out.printf("%f\t", f); + GenerateData gen = new GenerateData(k, n/k, d, f, true, .5f); + + // gen.writeCSVToFile(new File("/home/lee/Desktop/reclsh/in.csv")); + + // List data = "/C:/Users/user/Desktop/temp/OutputTwrpCents1" + + RPHashObject o = new SimpleArrayReader(gen.data, k); + + o.setDimparameter(20); + + o.setCutoff(100); + o.setRandomVector(true); + +// 
System.out.println("cutoff = "+ o.getCutoff()); +// System.out.println("get_random_Vector = "+ o.getRandomVector()); + + TWRPv2 rphit = new TWRPv2(o); + long startTime = System.nanoTime(); + List centsr = rphit.getCentroids(); + + avgtime += (System.nanoTime() - startTime) / 100000000; + + avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(), + gen.getData()); + + VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); + + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.gc(); + + System.out.printf("%.0f\n", avgrealwcss / count); + + + } + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + this.so.setK(getk); + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + return false; + } + + //@Override + public void setCutoff(int getCutoff) { + this.so.setCutoff(getCutoff); + } + + //@Override + public void setRandomVector(boolean getRandomVector) { + this.so.setRandomVector(getRandomVector); + } + + +} From 4b74eec7e2e2e04f2432e3709b88fe0afe307d2b Mon Sep 17 00:00:00 2001 From: deysn Date: Thu, 22 Aug 2019 13:15:32 -0400 Subject: [PATCH 06/29] TWRPv3 : TWRPv3 : --- src/main/java/edu/uc/rphash/TWRPv3.java | 5 +- src/main/java/edu/uc/rphash/TWRPv4.java | 440 +++++++++++++ src/main/java/edu/uc/rphash/TWRPv5_WCSS.java | 621 +++++++++++++++++++ 3 files changed, 1063 insertions(+), 3 deletions(-) create mode 100644 
src/main/java/edu/uc/rphash/TWRPv4.java create mode 100644 src/main/java/edu/uc/rphash/TWRPv5_WCSS.java diff --git a/src/main/java/edu/uc/rphash/TWRPv3.java b/src/main/java/edu/uc/rphash/TWRPv3.java index ff221d4..b031fcd 100644 --- a/src/main/java/edu/uc/rphash/TWRPv3.java +++ b/src/main/java/edu/uc/rphash/TWRPv3.java @@ -285,7 +285,7 @@ public Multimap findDensityModes2() { { addtocounter(x, projector, MapOfIDAndCent1, MapOfIDAndCount1,ct++, rngvec); addtocounter(x, projector, MapOfIDAndCent2, MapOfIDAndCount2,ct++, rngvec2); - addtocounter(x, projector, MapOfIDAndCent2, MapOfIDAndCount2,ct++, rngvec3); + addtocounter(x, projector, MapOfIDAndCent3, MapOfIDAndCount3,ct++, rngvec3); } @@ -405,7 +405,6 @@ public Multimap findDensityModes2() { public void run() { rngvec = new float[so.getDimparameter()]; - rngvec2 = new float[so.getDimparameter()]; rngvec3 = new float[so.getDimparameter()]; @@ -532,7 +531,7 @@ public static void main(String[] args) throws FileNotFoundException, // System.out.println("cutoff = "+ o.getCutoff()); // System.out.println("get_random_Vector = "+ o.getRandomVector()); - TWRPv2 rphit = new TWRPv2(o); + TWRPv3 rphit = new TWRPv3(o); long startTime = System.nanoTime(); List centsr = rphit.getCentroids(); diff --git a/src/main/java/edu/uc/rphash/TWRPv4.java b/src/main/java/edu/uc/rphash/TWRPv4.java new file mode 100644 index 0000000..d0bad1f --- /dev/null +++ b/src/main/java/edu/uc/rphash/TWRPv4.java @@ -0,0 +1,440 @@ +package edu.uc.rphash; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map.Entry; +import java.util.Random; +import java.util.TreeSet; +import java.util.stream.Stream; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.tests.StatTests; +import 
edu.uc.rphash.tests.clusterers.Agglomerative3; +import edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.VectorUtil; + +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Multimap; + + +public class TWRPv4 implements Clusterer, Runnable { + + boolean znorm = false; + + private int counter; + private List centroids = null; + private float[] bisectionVector; + + private RPHashObject so; + + public TWRPv4(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + return getCentroids(); + } + + @Override + public List getCentroids() { + if (centroids == null) + run(); + return centroids; + } + + /* + * X - set of vectors compute the medoid of a vector set + */ + float[] medoid(List X) { + float[] ret = X.get(0); + for (int i = 1; i < X.size(); i++) { + for (int j = 0; j < ret.length; j++) { + ret[j] += X.get(i)[j]; + } + } + for (int j = 0; j < ret.length; j++) { + ret[j] = ret[j] / ((float) X.size()); + } + return ret; + } + +// this updates the map two cents with different weigths are merged into one. + public static float[][] UpdateHashMap(float cnt_1, float[] x_1, + float cnt_2, float[] x_2) { + + float cnt_r = cnt_1 + cnt_2; + + float[] x_r = new float[x_1.length]; + + for (int i = 0; i < x_1.length; i++) { + x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; + + } + + float[][] ret = new float[2][]; + ret[0] = new float[1]; + ret[0][0] = cnt_r; + ret[1] = x_r; + return ret; + } + + //float[] rngvec; the range vector is moot if incoming data has been normalized + //post normalization it should all be zero centered, with variance 1 + /* + * super simple hash algorithm, reminiscient of pstable lsh + */ + // xt is the projected vector and x is the original vector , rngvec is the randomly generated vector of projected dim. 
+ + + public long hashvec2(float[] xt, float[] x, + HashMap MapOfIDAndCent, HashMap MapOfIDAndCount,int ct, float[] bisectionVector) { + +// for (int i=0 ; i data = "/C:/Users/user/Desktop/temp/OutputTwrpCents1" + + RPHashObject o = new SimpleArrayReader(gen.data, k); + + o.setDimparameter(16); + + o.setCutoff(100); + o.setRandomVector(true); +// System.out.println("cutoff = "+ o.getCutoff()); + System.out.println("get_random_Vector = "+ o.getRandomVector()); + + TWRPv4 rphit = new TWRPv4(o); + long startTime = System.nanoTime(); + List centsr = rphit.getCentroids(); + + avgtime += (System.nanoTime() - startTime) / 100000000; + + avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(), + gen.getData()); + + VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); + + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.gc(); + + System.out.printf("%.0f\n", avgrealwcss / count); + + + } + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + this.so.setK(getk); + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + return false; + } + + //@Override + public void setCutoff(int getCutoff) { + this.so.setCutoff(getCutoff); + } + + //@Override + public void setRandomVector(boolean getRandomVector) { + this.so.setRandomVector(getRandomVector); + } + + +} diff --git a/src/main/java/edu/uc/rphash/TWRPv5_WCSS.java 
b/src/main/java/edu/uc/rphash/TWRPv5_WCSS.java new file mode 100644 index 0000000..5f5e8d8 --- /dev/null +++ b/src/main/java/edu/uc/rphash/TWRPv5_WCSS.java @@ -0,0 +1,621 @@ +package edu.uc.rphash; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +//import java.util.Arrays; +import java.util.HashMap; +//import java.util.Iterator; +//import java.util.LinkedHashMap; +import java.util.List; +//import java.util.Map; +import java.util.Map.Entry; +import java.util.Random; +import java.util.TreeSet; +import java.util.stream.Stream; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.clusterers.Agglomerative3; +import edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.VectorUtil; + +//import org.apache.commons.collections.map.MultiValueMap; +//import org.apache.commons.collections.map.*; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Multimap; + + + +public class TWRPv5_WCSS implements Clusterer, Runnable { + + boolean znorm = false; + + private int counter; + private float[] rngvec; + private float[] rngvec2; + private float[] rngvec3; + + private List centroids = null; + + private RPHashObject so; + + public TWRPv5_WCSS(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + return getCentroids(); + } + + @Override + public List getCentroids() { + if (centroids == null) + run(); + return centroids; + } + + + + // combines two hashmaps of idsandcents + + public static HashMap mergehmapsidsandcents( + HashMap partidandcent1, + HashMap partidandcent2, + HashMap partidandcount1, + HashMap partidandcount2) +{ + // new empty map + HashMap combined = new HashMap<>(); + combined.putAll( partidandcent1); + + + + for(Long key : 
partidandcent2.keySet()) { + if(combined.containsKey(key)) { + + + Long weight1= partidandcount1.get(key); + + float[] cent1= combined.get(key); + + Long weight2= partidandcount2.get(key); + + float [] cent2= partidandcent2.get(key); + + float [][] joined = UpdateHashMap(weight1, cent1 ,weight2 , cent2 ); + float combinedCount = joined[0][0]; + float [] combinedCent = joined[1]; + + + combined.put(key,combinedCent); + + } + else { + combined.put(key,partidandcent2.get(key)); + } + } + + return (combined); + +} + + + + // combines two hashmaps of idsandcounts + + public static HashMap mergehmapsidsandcounts(HashMap partidandcount1, + HashMap partidandcountt2) + { + + + HashMap combined = new HashMap (); // new empty map + combined.putAll(partidandcount1); + + + + for(Long key : partidandcountt2.keySet()) { + if(combined.containsKey(key)) { + + + long value1 = combined.get(key); + long value2 = partidandcountt2.get(key); + long value3 = value1 + value2; + + combined.put(key,value3); + + + } + else { + combined.put(key,partidandcountt2.get(key)); + } + + + + } + return (combined); + } + + + + /* + * X - set of vectors compute the medoid of a vector set + */ + float[] medoid(List X) { + float[] ret = X.get(0); + for (int i = 1; i < X.size(); i++) { + for (int j = 0; j < ret.length; j++) { + ret[j] += X.get(i)[j]; + } + } + for (int j = 0; j < ret.length; j++) { + ret[j] = ret[j] / ((float) X.size()); + } + return ret; + } + +// this updates the map two cents with different weigths are merged into one. 
+ public static float[][] UpdateHashMap(float cnt_1, float[] x_1, + float cnt_2, float[] x_2) { + + float cnt_r = cnt_1 + cnt_2; + + float[] x_r = new float[x_1.length]; + + for (int i = 0; i < x_1.length; i++) { + x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; + + + x_diff_sq[i]= (x_1[i]- x_2[i])*(x_1[i]- x_2[i]); + + + } + + + + float[][] ret = new float[2][]; + ret[0] = new float[1]; + ret[0][0] = cnt_r; + ret[1] = x_r; + return ret; + + + + + } + + + public long hashvec2( float[] xt, float[] x, + HashMap MapOfIDAndCent, HashMap MapOfIDAndCount, int ct, float[] rngvec, HashMap MapOfIDAndWCSS) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { +// s <<= 1; + s = s << 1 ; // left shift the bits of s by 1. + if (xt[i] > rngvec[i]) + s= s+1; + + if (MapOfIDAndCent.containsKey(s)) { + + float CurrentCount = MapOfIDAndCount.get(s); + float CurrentCent [] = MapOfIDAndCent.get(s); + float CountForIncomingVector = 1; + float IncomingVector [] = x; + + float[][] MergedValues = UpdateHashMap(CurrentCount , CurrentCent, CountForIncomingVector, IncomingVector ); + + Long UpdatedCount = (long) MergedValues[0][0] ; + + float[] MergedVector = MergedValues[1] ; + + MapOfIDAndCount.put(s , UpdatedCount); + + MapOfIDAndCent.put(s, MergedVector); + + } + + else { + + float[] xlist = x; + MapOfIDAndCent.put(s, xlist); + MapOfIDAndCount.put(s, (long)1); + } + } + return s; + } + + + /* + * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid + * vector map + * + * hash the projected vector x and update the hash to centroid and counts + * maps + */ + void addtocounter(float[] x, Projector p, + HashMap IDAndCent,HashMap IDandID,int ct, float[] rngvec , HashMap IDandWCSS) { + float[] xt = p.project(x); + + hashvec2(xt,x,IDAndCent, IDandID, ct,rngvec , IDandWCSS); + } + + + + static boolean isPowerOfTwo(long num) { + return (num & -num) == num; + } + + + + + /* + * X - data set k - canonical k in k-means l - clustering sub-space Compute + 
* density mode via iterative deepening hash counting + */ + + + public Multimap findDensityModes2() { + //public Map findDensityModes2() { + HashMap MapOfIDAndCent1 = new HashMap<>(); + HashMap MapOfIDAndCount1 = new HashMap<>(); + + + HashMap MapOfIDAndCent2 = new HashMap<>(); + HashMap MapOfIDAndCount2 = new HashMap<>(); + + + HashMap MapOfIDAndCent3 = new HashMap<>(); + HashMap MapOfIDAndCount3 = new HashMap<>(); + + HashMap MapOfIDAndCent = new HashMap<>(); + HashMap MapOfIDAndCount = new HashMap<>(); + + + HashMap MapOfIDAndWCSS1 = new HashMap<>(); + HashMap MapOfIDAndWCSS2 = new HashMap<>(); + HashMap MapOfIDAndWCSS3 = new HashMap<>(); + HashMap MapOfIDAndWCSS = new HashMap<>(); + + + + + + + // #create projector matrixs + Projector projector = so.getProjectionType(); + projector.setOrigDim(so.getdim()); + projector.setProjectedDim(so.getDimparameter()); + projector.setRandomSeed(so.getRandomSeed()); + projector.init(); + int cutoff = so.getCutoff(); + + int ct = 0; + + { + + for (float[] x : so.getRawData()) + { + addtocounter(x, projector, MapOfIDAndCent1, MapOfIDAndCount1,ct++, rngvec, MapOfIDAndWCSS1); + addtocounter(x, projector, MapOfIDAndCent2, MapOfIDAndCount2,ct++, rngvec2,MapOfIDAndWCSS2); + addtocounter(x, projector, MapOfIDAndCent3, MapOfIDAndCount3,ct++, rngvec3,MapOfIDAndWCSS3); + + + } + } + + + MapOfIDAndCount=mergehmapsidsandcounts(MapOfIDAndCount1, MapOfIDAndCount2); + + MapOfIDAndCent = mergehmapsidsandcents(MapOfIDAndCent1,MapOfIDAndCent2,MapOfIDAndCount1, MapOfIDAndCount2 ); + + MapOfIDAndCent = mergehmapsidsandcents(MapOfIDAndCent,MapOfIDAndCent3,MapOfIDAndCount, MapOfIDAndCount3 ); + + MapOfIDAndCount=mergehmapsidsandcounts(MapOfIDAndCount, MapOfIDAndCount3); + + + + + + + System.out.println("NumberOfMicroClustersBeforePruning = "+ MapOfIDAndCent.size()); + + // next we want to prune the tree by parent count comparison + // follows breadthfirst search + + + + + HashMap denseSetOfIDandCount2 = new HashMap(); + for (Long cur_id : new 
TreeSet(MapOfIDAndCount.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + +//HashMap> IDAndCent = new HashMap<>(); and HashMap MapOfIDAndCent = new HashMap(); + + MapOfIDAndCent.put(parent_id, new float[]{}); + + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent.put(parent_id, new float[]{}); + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2.put(cur_id, (long) cur_count); + } + } + } + } + } + + + System.out.println("NumberOfMicroClustersAfterPruning&beforesortingLimit = "+ denseSetOfIDandCount2.size()); + + //remove keys with support less than 1 + Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); + //64 so 6 bits? + //stream = stream.filter(p -> p.getKey() > 64); + +// Stream> stream3 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); +// long counter= stream3.count(); +// System.out.println("NumberOfMicroClustersAfterPruning&limit_the_1s = "+ counter); + +// int cutoff= so.getk()*8; +// if (so.getk()*6 < 210) { cutoff=210+so.getk();} else { cutoff = so.getk()*8;} +// int cutoff = clustermembers.size()>200+so.getk()?200+so.getk():clustermembers.size(); +// System.out.println("Cutoff = "+ cutoff); + + List sortedIDList2= new ArrayList<>(); + // sort and limit the list + stream2.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2.add(x.getKey())); + +// HashMap KeyAndCent = new HashMap<>(); +// HashMap KeyAndCount = new HashMap<>(); +// Map WeightAndCent = new HashMap<>(); +// Map WeightAndCent = new LinkedHashMap<>(); + Multimap multimapWeightAndCent = ArrayListMultimap.create(); + + + +// + for (Long keys: sortedIDList2) + + { +// WeightAndCent.put((Long)(MapOfIDAndCount.get(keys)), (float[]) (MapOfIDAndCent.get(keys))); + + multimapWeightAndCent.put((Long)(MapOfIDAndCount.get(keys)), (float[]) (MapOfIDAndCent.get(keys))); + +// KeyAndCent.put(keys, MapOfIDAndCent.get(keys)); +// KeyAndCount.put(keys, MapOfIDAndCount.get(keys)); + + } + + + + return multimapWeightAndCent; + +} + + + public void run() { + rngvec = new float[so.getDimparameter()]; + + rngvec2 = new float[so.getDimparameter()]; + + rngvec3 = new float[so.getDimparameter()]; + + counter = 0; + boolean randVect = so.getRandomVector(); + + // Random r = new Random(so.getRandomSeed()); + // Random r = new Random(3800635955020675334L) ; + Random r = new Random(); + Random r2 = new Random(); + Random r3 = new Random(); + + if (randVect==true){ + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) r.nextGaussian(); + + for (int i = 0; i < so.getDimparameter(); i++) + rngvec2[i] = (float) r2.nextGaussian(); + + for (int i = 0; i < so.getDimparameter(); i++) + rngvec3[i] = (float) r3.nextGaussian(); + + + } else { + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) 0; + } + + + + + + + Multimap WeightAndClusters = findDensityModes2(); + //Map WeightAndClusters = findDensityModes2(); + + + + + Listcentroids2 = new ArrayList<>(); + List weights2 =new ArrayList<>(); + + + System.out.println("NumberOfMicroClusters_AfterPruning = "+ WeightAndClusters.size()); + System.out.println("getRandomVector = "+ randVect); + + // int k = NumberOfMicroClusters>200+so.getk()?200+so.getk():NumberOfMicroClusters; + + // have to prune 
depending NumberOfMicroClusters returned. + // int i = 1; + // int j=1; + // for (Long weights : new TreeSet(WeightAndClusters.keySet())) + for (Long weights : WeightAndClusters.keys()) + { + // System.out.println("NumberOfTreesetkeys = "+ i); + // String key =weights.toString(); + // System.out.println(weights); + weights2.add((float)weights); + // centroids2.add(WeightAndClusters.get(weights)); + // centroids2.addAll(WeightAndClusters.get(weights)); + // i=i+1; + } + // System.out.println("done printing keys for weights"); + + for (Long weight : WeightAndClusters.keySet()) + + { + // System.out.println(weight); + // System.out.println("NumberOfTreesetkeys = "+ j); + centroids2.addAll(WeightAndClusters.get(weight)); + + // j=j+1; + } + // System.out.println("done printing keys for centroids"); + + // System.out.println(weights2.size()); + // System.out.println(centroids2.size()); + + //System.out.printf("\tvalueofK is "); + //System.out.println( so.getk()); + + Agglomerative3 aggloOffline = new Agglomerative3(centroids2, so.getk()); + + aggloOffline.setWeights(weights2); + + this.centroids = aggloOffline.getCentroids(); + + } + + + public static void main(String[] args) throws FileNotFoundException, + IOException { + + int k = 10;//6; + int d = 700;//16; + int n = 10000; + float var = 1.5f; + int count = 1; + // System.out.printf("ClusterVar\t"); + // for (int i = 0; i < count; i++) + // System.out.printf("Trial%d\t", i); + // System.out.printf("RealWCSS\n"); + + String Output = "/C:/Users/deysn/Desktop/temp/OutputTwrpCents1" ; + + float f = var; + float avgrealwcss = 0; + float avgtime = 0; + // System.out.printf("%f\t", f); + GenerateData gen = new GenerateData(k, n/k, d, f, true, .5f); + + // gen.writeCSVToFile(new File("/home/lee/Desktop/reclsh/in.csv")); + + // List data = "/C:/Users/user/Desktop/temp/OutputTwrpCents1" + + RPHashObject o = new SimpleArrayReader(gen.data, k); + + o.setDimparameter(20); + + o.setCutoff(100); + o.setRandomVector(true); + +// 
System.out.println("cutoff = "+ o.getCutoff()); +// System.out.println("get_random_Vector = "+ o.getRandomVector()); + + TWRPv5_WCSS rphit = new TWRPv5_WCSS(o); + long startTime = System.nanoTime(); + List centsr = rphit.getCentroids(); + + avgtime += (System.nanoTime() - startTime) / 100000000; + + avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(), + gen.getData()); + + VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); + + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.gc(); + + System.out.printf("%.0f\n", avgrealwcss / count); + + + } + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + this.so.setK(getk); + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + return false; + } + + //@Override + public void setCutoff(int getCutoff) { + this.so.setCutoff(getCutoff); + } + + //@Override + public void setRandomVector(boolean getRandomVector) { + this.so.setRandomVector(getRandomVector); + } + + +} From 8646ed50adb2d034f5b5baece557e0c7633fa5ff Mon Sep 17 00:00:00 2001 From: deysn Date: Thu, 5 Sep 2019 08:38:42 -0400 Subject: [PATCH 07/29] added the selection of the best tree having best wcss. can select trwp version fron cmd line. 
--- src/main/java/edu/uc/rphash/RPHash.java | 19 +- src/main/java/edu/uc/rphash/TWRPv5_WCSS.java | 263 +++++++------------ 2 files changed, 112 insertions(+), 170 deletions(-) diff --git a/src/main/java/edu/uc/rphash/RPHash.java b/src/main/java/edu/uc/rphash/RPHash.java index 2098fa5..618031c 100644 --- a/src/main/java/edu/uc/rphash/RPHash.java +++ b/src/main/java/edu/uc/rphash/RPHash.java @@ -51,7 +51,7 @@ public class RPHash { static String[] clusteringmethods = { "simple", "streaming", "multiproj", - "kmeans", "pkmeans","kmeansplusplus", "streamingkmeans", "adaptive","dummy" ,"twrp"}; + "kmeans", "pkmeans","kmeansplusplus", "streamingkmeans", "adaptive","dummy" ,"twrp" , "twrpbisect", "twrpbest", "twrpmergetree" }; static String[] offlineclusteringmethods = { "singlelink", "completelink", "averagelink", "kmeans", "adaptivemeanshift", "kmpp", "multikmpp" , "dbscan", "none" }; @@ -757,6 +757,23 @@ public static List runConfigs(List untaggedArgs, break; } + case "twrpmergetree": { + runitems.add(new TWRPv3(o)); + break; + } + + case "twrpbisect": { + runitems.add(new TWRPv4(o)); + break; + } + + case "twrpbest": { + runitems.add(new TWRPv5_WCSS(o)); + break; + } + + + case "dummy": { runitems.add(new DummyClusterer(so)); break; diff --git a/src/main/java/edu/uc/rphash/TWRPv5_WCSS.java b/src/main/java/edu/uc/rphash/TWRPv5_WCSS.java index 5f5e8d8..62e9e1f 100644 --- a/src/main/java/edu/uc/rphash/TWRPv5_WCSS.java +++ b/src/main/java/edu/uc/rphash/TWRPv5_WCSS.java @@ -61,86 +61,7 @@ public List getCentroids() { - // combines two hashmaps of idsandcents - - public static HashMap mergehmapsidsandcents( - HashMap partidandcent1, - HashMap partidandcent2, - HashMap partidandcount1, - HashMap partidandcount2) -{ - // new empty map - HashMap combined = new HashMap<>(); - combined.putAll( partidandcent1); - - - - for(Long key : partidandcent2.keySet()) { - if(combined.containsKey(key)) { - - - Long weight1= partidandcount1.get(key); - - float[] cent1= combined.get(key); - - 
Long weight2= partidandcount2.get(key); - - float [] cent2= partidandcent2.get(key); - - float [][] joined = UpdateHashMap(weight1, cent1 ,weight2 , cent2 ); - float combinedCount = joined[0][0]; - float [] combinedCent = joined[1]; - - - combined.put(key,combinedCent); - - } - else { - combined.put(key,partidandcent2.get(key)); - } - } - - return (combined); - -} - - - - // combines two hashmaps of idsandcounts - - public static HashMap mergehmapsidsandcounts(HashMap partidandcount1, - HashMap partidandcountt2) - { - - - HashMap combined = new HashMap (); // new empty map - combined.putAll(partidandcount1); - - - - for(Long key : partidandcountt2.keySet()) { - if(combined.containsKey(key)) { - - - long value1 = combined.get(key); - long value2 = partidandcountt2.get(key); - long value3 = value1 + value2; - - combined.put(key,value3); - - - } - else { - combined.put(key,partidandcountt2.get(key)); - } - - - - } - return (combined); - } - - + /* /* * X - set of vectors compute the medoid of a vector set @@ -159,32 +80,49 @@ float[] medoid(List X) { } // this updates the map two cents with different weigths are merged into one. 
- public static float[][] UpdateHashMap(float cnt_1, float[] x_1, - float cnt_2, float[] x_2) { + public static float[][] UpdateHashMap(float cnt_1, float[] x_1, float wcss_1, + float cnt_2, float[] x_2 , float wcss_2) { float cnt_r = cnt_1 + cnt_2; float[] x_r = new float[x_1.length]; + float[] var_r1 = new float[x_1.length]; + float[] var_r2 = new float[x_1.length]; + + double var1=0; + double var2=0; + + for (int i = 0; i < x_1.length; i++) { x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; - - x_diff_sq[i]= (x_1[i]- x_2[i])*(x_1[i]- x_2[i]); - - + var_r1[i] = ((-x_r[i] + x_1[i]) * (-x_r[i] + x_1[i]))/1000000000; + + var_r2[i] =(((-x_r[i] + x_2[i]) * (-x_r[i] + x_2[i])))/1000000000; + } - - - float[][] ret = new float[2][]; + for (int i = 0; i < var_r1.length; i++) { + var1 = var1 + var_r1[i]; + + var2 = var2 + var_r2[i]; + } + double wcsse=0; + wcsse = ( cnt_1*(wcss_1*wcss_1 + (var1)) + var2 / (cnt_1 + cnt_2 ) ) ; + + // System.out.println("wcsse = " + wcsse); + + float wcss = (float) wcsse; + + float[][] ret = new float[3][]; ret[0] = new float[1]; ret[0][0] = cnt_r; ret[1] = x_r; + ret[2]= new float [1]; + ret[2][0]= wcss; return ret; - - - + } @@ -204,16 +142,24 @@ public long hashvec2( float[] xt, float[] x, float CurrentCent [] = MapOfIDAndCent.get(s); float CountForIncomingVector = 1; float IncomingVector [] = x; + float currentWcss= MapOfIDAndWCSS.get(s); + float incomingWcss= 0; + - float[][] MergedValues = UpdateHashMap(CurrentCount , CurrentCent, CountForIncomingVector, IncomingVector ); + float[][] MergedValues = UpdateHashMap(CurrentCount , CurrentCent, currentWcss, CountForIncomingVector, IncomingVector, incomingWcss ); Long UpdatedCount = (long) MergedValues[0][0] ; float[] MergedVector = MergedValues[1] ; + float wcss= MergedValues[2][0]; + MapOfIDAndCount.put(s , UpdatedCount); MapOfIDAndCent.put(s, MergedVector); + + MapOfIDAndWCSS.put(s, wcss); + } @@ -222,6 +168,7 @@ public long hashvec2( float[] xt, float[] x, float[] xlist = x; 
MapOfIDAndCent.put(s, xlist); MapOfIDAndCount.put(s, (long)1); + MapOfIDAndWCSS.put(s, (float)0); } } return s; @@ -243,12 +190,10 @@ void addtocounter(float[] x, Projector p, } - static boolean isPowerOfTwo(long num) { return (num & -num) == num; } - /* @@ -261,27 +206,20 @@ public Multimap findDensityModes2() { //public Map findDensityModes2() { HashMap MapOfIDAndCent1 = new HashMap<>(); HashMap MapOfIDAndCount1 = new HashMap<>(); - + HashMap MapOfIDAndWCSS1 = new HashMap<>(); HashMap MapOfIDAndCent2 = new HashMap<>(); HashMap MapOfIDAndCount2 = new HashMap<>(); - + HashMap MapOfIDAndWCSS2 = new HashMap<>(); HashMap MapOfIDAndCent3 = new HashMap<>(); HashMap MapOfIDAndCount3 = new HashMap<>(); + HashMap MapOfIDAndWCSS3 = new HashMap<>(); HashMap MapOfIDAndCent = new HashMap<>(); HashMap MapOfIDAndCount = new HashMap<>(); - - - HashMap MapOfIDAndWCSS1 = new HashMap<>(); - HashMap MapOfIDAndWCSS2 = new HashMap<>(); - HashMap MapOfIDAndWCSS3 = new HashMap<>(); HashMap MapOfIDAndWCSS = new HashMap<>(); - - - - + // #create projector matrixs @@ -305,19 +243,49 @@ public Multimap findDensityModes2() { } } + + float WCSS1 = 0; + float WCSS2 = 0; + float WCSS3 = 0; - MapOfIDAndCount=mergehmapsidsandcounts(MapOfIDAndCount1, MapOfIDAndCount2); - - MapOfIDAndCent = mergehmapsidsandcents(MapOfIDAndCent1,MapOfIDAndCent2,MapOfIDAndCount1, MapOfIDAndCount2 ); - - MapOfIDAndCent = mergehmapsidsandcents(MapOfIDAndCent,MapOfIDAndCent3,MapOfIDAndCount, MapOfIDAndCount3 ); - MapOfIDAndCount=mergehmapsidsandcounts(MapOfIDAndCount, MapOfIDAndCount3); + for (Long cur_id : (MapOfIDAndWCSS1.keySet())) + + { // System.out.println("wcss1 = " + MapOfIDAndWCSS1.get(cur_id)); + WCSS1 = WCSS1 + MapOfIDAndWCSS1.get(cur_id);} + for (Long cur_id : (MapOfIDAndWCSS2.keySet())) + + { WCSS2 = WCSS2 + MapOfIDAndWCSS2.get(cur_id);} + for (Long cur_id : (MapOfIDAndWCSS3.keySet())) + + { WCSS3 = WCSS3 + MapOfIDAndWCSS3.get(cur_id);} +// System.out.println("wcss1 = " + WCSS1); +// 
System.out.println("wcss2 = " + WCSS2); +// System.out.println("wcss3 = " + WCSS3); + if ((WCSS1 >= WCSS2) && (WCSS1>=WCSS3)) + {MapOfIDAndCount = MapOfIDAndCount1; + MapOfIDAndCent = MapOfIDAndCent1; + MapOfIDAndWCSS = MapOfIDAndWCSS1; + System.out.println("winner = tree1"); + } + else if ((WCSS2 >= WCSS1) && (WCSS2>=WCSS3)) + {MapOfIDAndCount = MapOfIDAndCount2; + MapOfIDAndCent = MapOfIDAndCent2; + MapOfIDAndWCSS = MapOfIDAndWCSS2; + System.out.println("winner = tree2"); + } + else + {MapOfIDAndCount = MapOfIDAndCount3; + MapOfIDAndCent = MapOfIDAndCent3; + MapOfIDAndWCSS = MapOfIDAndWCSS3; + System.out.println("winner = tree3"); + + } System.out.println("NumberOfMicroClustersBeforePruning = "+ MapOfIDAndCent.size()); @@ -327,7 +295,6 @@ public Multimap findDensityModes2() { - HashMap denseSetOfIDandCount2 = new HashMap(); for (Long cur_id : new TreeSet(MapOfIDAndCount.keySet())) { @@ -342,8 +309,6 @@ public Multimap findDensityModes2() { denseSetOfIDandCount2.put(parent_id, 0L); // IDAndCent.put(parent_id, new ArrayList<>()); -//HashMap> IDAndCent = new HashMap<>(); and HashMap MapOfIDAndCent = new HashMap(); - MapOfIDAndCent.put(parent_id, new float[]{}); // MapOfIDAndCount.put(parent_id, new Long (0)); @@ -372,45 +337,27 @@ public Multimap findDensityModes2() { //remove keys with support less than 1 Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); - //64 so 6 bits? 
- //stream = stream.filter(p -> p.getKey() > 64); - -// Stream> stream3 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); -// long counter= stream3.count(); -// System.out.println("NumberOfMicroClustersAfterPruning&limit_the_1s = "+ counter); - -// int cutoff= so.getk()*8; -// if (so.getk()*6 < 210) { cutoff=210+so.getk();} else { cutoff = so.getk()*8;} -// int cutoff = clustermembers.size()>200+so.getk()?200+so.getk():clustermembers.size(); -// System.out.println("Cutoff = "+ cutoff); + List sortedIDList2= new ArrayList<>(); // sort and limit the list stream2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) .forEachOrdered(x -> sortedIDList2.add(x.getKey())); -// HashMap KeyAndCent = new HashMap<>(); -// HashMap KeyAndCount = new HashMap<>(); -// Map WeightAndCent = new HashMap<>(); -// Map WeightAndCent = new LinkedHashMap<>(); - Multimap multimapWeightAndCent = ArrayListMultimap.create(); - + Multimap multimapWeightAndCent = ArrayListMultimap.create(); -// + for (Long keys: sortedIDList2) { -// WeightAndCent.put((Long)(MapOfIDAndCount.get(keys)), (float[]) (MapOfIDAndCent.get(keys))); + multimapWeightAndCent.put((Long)(MapOfIDAndCount.get(keys)), (float[]) (MapOfIDAndCent.get(keys))); - -// KeyAndCent.put(keys, MapOfIDAndCent.get(keys)); -// KeyAndCount.put(keys, MapOfIDAndCount.get(keys)); + } - return multimapWeightAndCent; @@ -451,13 +398,8 @@ public void run() { - - Multimap WeightAndClusters = findDensityModes2(); - //Map WeightAndClusters = findDensityModes2(); - - Listcentroids2 = new ArrayList<>(); @@ -467,40 +409,23 @@ public void run() { System.out.println("NumberOfMicroClusters_AfterPruning = "+ WeightAndClusters.size()); System.out.println("getRandomVector = "+ randVect); - // int k = NumberOfMicroClusters>200+so.getk()?200+so.getk():NumberOfMicroClusters; - - // have to prune depending NumberOfMicroClusters returned. 
- // int i = 1; - // int j=1; - // for (Long weights : new TreeSet(WeightAndClusters.keySet())) + for (Long weights : WeightAndClusters.keys()) { - // System.out.println("NumberOfTreesetkeys = "+ i); - // String key =weights.toString(); - // System.out.println(weights); + weights2.add((float)weights); - // centroids2.add(WeightAndClusters.get(weights)); - // centroids2.addAll(WeightAndClusters.get(weights)); - // i=i+1; + } - // System.out.println("done printing keys for weights"); + for (Long weight : WeightAndClusters.keySet()) { - // System.out.println(weight); - // System.out.println("NumberOfTreesetkeys = "+ j); + centroids2.addAll(WeightAndClusters.get(weight)); - // j=j+1; } - // System.out.println("done printing keys for centroids"); - - // System.out.println(weights2.size()); - // System.out.println(centroids2.size()); - - //System.out.printf("\tvalueofK is "); - //System.out.println( so.getk()); + Agglomerative3 aggloOffline = new Agglomerative3(centroids2, so.getk()); @@ -515,8 +440,8 @@ public static void main(String[] args) throws FileNotFoundException, IOException { int k = 10;//6; - int d = 700;//16; - int n = 10000; + int d = 200;//16; + int n = 100000; float var = 1.5f; int count = 1; // System.out.printf("ClusterVar\t"); @@ -538,7 +463,7 @@ public static void main(String[] args) throws FileNotFoundException, RPHashObject o = new SimpleArrayReader(gen.data, k); - o.setDimparameter(20); + o.setDimparameter(16); o.setCutoff(100); o.setRandomVector(true); From c7f92944c137eb0d7073ff4f63b9058dcf114c58 Mon Sep 17 00:00:00 2001 From: deysn Date: Thu, 17 Oct 2019 16:13:04 -0400 Subject: [PATCH 08/29] algorithms to choose best tree --- src/main/java/edu/uc/rphash/RPHash.java | 21 +- .../uc/rphash/RPHashSimple_multiPosLsh.java | 384 +++++++++++ src/main/java/edu/uc/rphash/TWRPv5_WCSS.java | 3 +- src/main/java/edu/uc/rphash/TWRPv6_COV.java | 549 +++++++++++++++ src/main/java/edu/uc/rphash/TWRPv6_WCSS2.java | 626 ++++++++++++++++++ 
.../edu/uc/rphash/TWRPv6_meanVariance.java | 549 +++++++++++++++ 6 files changed, 2128 insertions(+), 4 deletions(-) create mode 100644 src/main/java/edu/uc/rphash/RPHashSimple_multiPosLsh.java create mode 100644 src/main/java/edu/uc/rphash/TWRPv6_COV.java create mode 100644 src/main/java/edu/uc/rphash/TWRPv6_WCSS2.java create mode 100644 src/main/java/edu/uc/rphash/TWRPv6_meanVariance.java diff --git a/src/main/java/edu/uc/rphash/RPHash.java b/src/main/java/edu/uc/rphash/RPHash.java index 618031c..b0ed7be 100644 --- a/src/main/java/edu/uc/rphash/RPHash.java +++ b/src/main/java/edu/uc/rphash/RPHash.java @@ -51,7 +51,8 @@ public class RPHash { static String[] clusteringmethods = { "simple", "streaming", "multiproj", - "kmeans", "pkmeans","kmeansplusplus", "streamingkmeans", "adaptive","dummy" ,"twrp" , "twrpbisect", "twrpbest", "twrpmergetree" }; + "kmeans", "pkmeans","kmeansplusplus", "streamingkmeans", "adaptive","dummy" ,"twrp" , "twrpbisect", "twrpbest", "twrpmergetree", "twrpbest_afterpruning", + "twrpbest_cov","twrpbest_meanvariance" }; static String[] offlineclusteringmethods = { "singlelink", "completelink", "averagelink", "kmeans", "adaptivemeanshift", "kmpp", "multikmpp" , "dbscan", "none" }; @@ -772,7 +773,23 @@ public static List runConfigs(List untaggedArgs, break; } - + case "twrpbest_afterpruning": { + runitems.add(new TWRPv6_WCSS2(o)); + break; + } + + case "twrpbest_cov": { + runitems.add(new TWRPv6_COV(o)); + break; + } + + case "twrpbest_meanvariance": { + runitems.add(new TWRPv6_meanVariance(o)); + break; + } + + + case "dummy": { runitems.add(new DummyClusterer(so)); diff --git a/src/main/java/edu/uc/rphash/RPHashSimple_multiPosLsh.java b/src/main/java/edu/uc/rphash/RPHashSimple_multiPosLsh.java new file mode 100644 index 0000000..7a37edf --- /dev/null +++ b/src/main/java/edu/uc/rphash/RPHashSimple_multiPosLsh.java @@ -0,0 +1,384 @@ +package edu.uc.rphash; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; 
+import java.util.List; +import java.util.Random; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ForkJoinPool; +import java.util.function.Consumer; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.decoders.Decoder; +import edu.uc.rphash.decoders.DepthProbingLSH; +import edu.uc.rphash.decoders.Leech; +import edu.uc.rphash.decoders.Spherical; +import edu.uc.rphash.decoders.SphericalRandom; +import edu.uc.rphash.frequentItemSet.ItemSet; +import edu.uc.rphash.frequentItemSet.SimpleFrequentItemSet; +import edu.uc.rphash.lsh.LSH; +//import edu.uc.rphash.projections.DBFriendlyProjection; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.standardhash.HashAlgorithm; +import edu.uc.rphash.standardhash.NoHash; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.clusterers.KMeans2; +import edu.uc.rphash.tests.clusterers.MultiKMPP; +import edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.tests.generators.GenerateStreamData; +import edu.uc.rphash.tests.kmeanspp.KMeansPlusPlus; +import edu.uc.rphash.util.VectorUtil; + +public class RPHashSimple_multiPosLsh implements Clusterer { + // float variance; + + public ItemSet is; + + List labels; + HashMap labelmap; + + public static void mapfunc(float[] vec, LSH lshfunc, ItemSet is) { + + long hash = lshfunc.lshHash(vec); + is.add(hash); + } + + public RPHashObject map() { + + // create our LSH Machine + HashAlgorithm hal = new NoHash(so.getHashmod()); + Iterator vecs = so.getVectorIterator(); + if (!vecs.hasNext()) + return so; + + int logk = (int) (.5 + Math.log(so.getk()) / Math.log(2));// log k and + // round to + // integer + int k1 = so.getk() * logk; + is = new SimpleFrequentItemSet(k1); + Decoder dec = so.getDecoderType(); + dec.setCounter(is); + + Projector p = so.getProjectionType(); + p.setOrigDim(so.getdim()); + p.setProjectedDim(dec.getDimensionality()); + 
p.setRandomSeed(so.getRandomSeed()); + p.init(); + // no noise to start with + List noise = LSH.genNoiseTable( + dec.getDimensionality(), + so.getNumBlur(), + new Random(), + dec.getErrorRadius() + / (dec.getDimensionality() * dec.getDimensionality())); + + LSH lshfunc = new LSH(dec, p, hal, noise, so.getNormalize()); + + // add to frequent itemset the hashed Decoded randomly projected vector + + if (so.getParallel()) { + List data = so.getRawData(); + data.parallelStream().forEach(new Consumer() { + + @Override + public void accept(float[] t) { + mapfunc(t, lshfunc, is); + } + }); + } + while (vecs.hasNext()) { + mapfunc(vecs.next(), lshfunc, is); + } + // } + // while (vecs.hasNext()) { + // float[] vec = vecs.next(); + // long hash = lshfunc.lshHash(vec); + // is.add(hash); + // + // } + + List topids = is.getTop(); + so.setPreviousTopID(topids); + + List topsizes = is.getCounts(); + + List countsAsFloats = new ArrayList(); + for (long ct : topsizes) + countsAsFloats.add((float) ct); + so.setCounts(countsAsFloats); + return so; + } + + public static void redFunc(float[] vec, LSH lshfunc, List noise, + List labels, List centroids) { + long[] hash = lshfunc.lshHashRadius(vec, noise); + labels.add(-1l); + // radius probe around the vector + for (Centroid cent : centroids) { + for (long h : hash) { + if (cent.ids.contains(h)) { + cent.updateVec(vec); + labels.set(labels.size() - 1, cent.id); + } + } + } + } + + /* + * This is the second phase after the top ids have been in the reduce phase + * aggregated + */ + public RPHashObject reduce() { + + Iterator vecs = so.getVectorIterator(); + if (!vecs.hasNext()) + return so; + float[] vec = vecs.next(); + + HashAlgorithm hal = new NoHash(so.getHashmod()); + Decoder dec = so.getDecoderType(); + + Projector p = so.getProjectionType(); + p.setOrigDim(so.getdim()); + p.setProjectedDim(dec.getDimensionality()); + p.setRandomSeed(so.getRandomSeed()); + p.init(); + + List noise = LSH.genNoiseTable( + so.getdim(), + 
so.getNumBlur(), + new Random(so.getRandomSeed()), + (float) (dec.getErrorRadius()) + / (float) (dec.getDimensionality() * dec + .getDimensionality())); + + LSH lshfunc = new LSH(dec, p, hal, noise, so.getNormalize()); + List centroids = new ArrayList(); + + for (long id : so.getPreviousTopID()) { + centroids.add(new Centroid(so.getdim(), id, -1)); + } + + this.labels = new ArrayList<>(); + + if (so.getParallel()) { + try { + List data = so.getRawData(); + ForkJoinPool myPool = new ForkJoinPool(this.threads); + myPool.submit(() -> + + data.parallelStream().forEach(new Consumer() { + + @Override + public void accept(float[] t) { + redFunc(t, lshfunc, noise, labels, centroids); + } + + })).get(); + + } catch (InterruptedException | ExecutionException e) { + e.printStackTrace(); + } + + } else { + while (vecs.hasNext()) { + redFunc(vecs.next(), lshfunc, noise, labels, centroids); + } + } + + // while (vecs.hasNext()) + // { + // + // long[] hash = lshfunc.lshHashRadius(vec, noise); + // labels.add(-1l); + // //radius probe around the vector + // for (Centroid cent : centroids) { + // for (long h : hash) + // { + // if (cent.ids.contains(h)) { + // cent.updateVec(vec); + // this.labels.set(labels.size()-1,cent.id); + // } + // } + // } + // vec = vecs.next(); + // + // } + + + Clusterer offlineclusterer = so.getOfflineClusterer(); + offlineclusterer.setData(centroids); + offlineclusterer.setWeights(so.getCounts()); + offlineclusterer.setK(so.getk()); + + // System.out.println("\n k sent to offline = "+ so.getk()); + + this.centroids = offlineclusterer.getCentroids(); + + //System.out.println("\n cents in reduce from offline cluster = "+ this.centroids.size()); + + //System.out.println("\n cents in reduce after label mapping = "+ centroids.size()); + + this.labelmap = VectorUtil.generateIDMap(centroids, this.centroids); + + //so.setCentroids(centroids); + so.setCentroids(this.centroids); + + + + return so; + } + + // 271458 + // 264779.7 + + public List getLabels() { + 
for (int i = 0; i < labels.size(); i++) { + if (labelmap.containsKey(labels.get(i))) { + labels.set(i, labelmap.get(labels.get(i))); + } else { + labels.set(i, -1l); + } + } + return this.labels; + } + + private List centroids = null; + private RPHashObject so; + + public RPHashSimple_multiPosLsh(List data, int k) { + so = new SimpleArrayReader(data, k); + } + + int threads = 1; + + public RPHashSimple_multiPosLsh(List data, int k, int processors) { + // System.setProperty("java.util.concurrent.ForkJoinPool.common.parallelism",String.valueOf(processors)); + threads = processors; + so = new SimpleArrayReader(data, k); + so.setParallel(true); + } + + public RPHashSimple_multiPosLsh(List data, int k, int times, int rseed) { + so = new SimpleArrayReader(data, k); + } + + public RPHashSimple_multiPosLsh(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + if (centroids == null) + run(); + return centroids; + } + + @Override + public List getCentroids() { + if (centroids == null) + run(); + + return centroids; + } + + private void run() { + map(); + reduce(); + this.centroids = so.getCentroids(); + + + } + + public static void main(String[] args) { + int k = 10; + int d = 200; + int n = 1000; + float var = 1f; + int count = 5; + System.out.printf("Decoder: %s\n", "Sphere"); + System.out.printf("ClusterVar\t"); + for (int i = 0; i < count; i++) + System.out.printf("Trial%d\t", i); + System.out.printf("RealWCSS\n"); + + for (float f = var; f < 3.01; f += .05f) { + float avgrealwcss = 0; + float avgtime = 0; + System.out.printf("%f\t", f); + for (int i = 0; i < count; i++) { + GenerateData gen = new GenerateData(k, n / k, d, f, true, 1f); + RPHashObject o = new SimpleArrayReader(gen.data, k); + RPHashSimple_multiPosLsh rphit = new RPHashSimple_multiPosLsh(o); + o.setDecoderType(new SphericalRandom(32, 4, 1)); + //o.setDecoderType(new Spherical(32, 4, 1)); + // o.setDimparameter(31); + //o.setOfflineClusterer(new 
KMeans2()); + o.setOfflineClusterer(new MultiKMPP()); + + //System.out.println("\n k sent to offline in MAIN = "+ o.getk()); + + long startTime = System.nanoTime(); + List centsr = rphit.getCentroids(); + + //System.out.println("\n no of final cents : " + centsr.size()); + + + avgtime += (System.nanoTime() - startTime) / 100000000; + + avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(),gen.getData()); + + System.out.printf("%.0f\t", + StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.gc(); + + } + System.out.printf("%.0f\n", avgrealwcss / count); + } + } + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + // TODO Auto-generated method stub + + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + return true; + } +} diff --git a/src/main/java/edu/uc/rphash/TWRPv5_WCSS.java b/src/main/java/edu/uc/rphash/TWRPv5_WCSS.java index 62e9e1f..4559d38 100644 --- a/src/main/java/edu/uc/rphash/TWRPv5_WCSS.java +++ b/src/main/java/edu/uc/rphash/TWRPv5_WCSS.java @@ -59,8 +59,7 @@ public List getCentroids() { return centroids; } - - + /* /* diff --git a/src/main/java/edu/uc/rphash/TWRPv6_COV.java b/src/main/java/edu/uc/rphash/TWRPv6_COV.java new file mode 100644 index 0000000..f30e517 --- /dev/null +++ b/src/main/java/edu/uc/rphash/TWRPv6_COV.java @@ -0,0 +1,549 @@ +package edu.uc.rphash; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import 
java.util.ArrayList; +//import java.util.Arrays; +import java.util.HashMap; +//import java.util.Iterator; +//import java.util.LinkedHashMap; +import java.util.List; +//import java.util.Map; +import java.util.Map.Entry; +import java.util.Random; +import java.util.TreeSet; +import java.util.stream.Stream; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.clusterers.Agglomerative3; +import edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.VectorUtil; + +//import org.apache.commons.collections.map.MultiValueMap; +//import org.apache.commons.collections.map.*; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Multimap; + + + +public class TWRPv6_COV implements Clusterer, Runnable { + + boolean znorm = false; + + private int counter; + private float[] rngvec; + private float[] rngvec2; + private float[] rngvec3; + + private List centroids = null; + + private RPHashObject so; + + public TWRPv6_COV(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + return getCentroids(); + } + + @Override + public List getCentroids() { + if (centroids == null) + run(); + return centroids; + } + + + /* + + /* + * X - set of vectors compute the medoid of a vector set + */ + float[] medoid(List X) { + float[] ret = X.get(0); + for (int i = 1; i < X.size(); i++) { + for (int j = 0; j < ret.length; j++) { + ret[j] += X.get(i)[j]; + } + } + for (int j = 0; j < ret.length; j++) { + ret[j] = ret[j] / ((float) X.size()); + } + return ret; + } + +// this updates the map two cents with different weigths are merged into one. 
+ public static float[][] UpdateHashMap(float cnt_1, float[] x_1, float wcss_1, + float cnt_2, float[] x_2 , float wcss_2) { + + float cnt_r = cnt_1 + cnt_2; + + float[] x_r = new float[x_1.length]; + + float[] var_r1 = new float[x_1.length]; + float[] var_r2 = new float[x_1.length]; + + double var1=0; + double var2=0; + + + for (int i = 0; i < x_1.length; i++) { + x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; + + var_r1[i] = ((-x_r[i] + x_1[i]) * (-x_r[i] + x_1[i]))/(x_r[i]*1000000000); + + var_r2[i] =(((-x_r[i] + x_2[i]) * (-x_r[i] + x_2[i])))/(x_r[i]*1000000000); + + } + + for (int i = 0; i < var_r1.length; i++) { + var1 = var1 + var_r1[i]; + + var2 = var2 + var_r2[i]; + } + double wcsse=0; +// wcsse = ( cnt_1*(wcss_1*wcss_1 + (var1)) + var2 / (cnt_1 + cnt_2 ) ) ; + + wcsse = ( cnt_1*(wcss_1 + (var1)) + var2 / (cnt_1 + cnt_2 ) ) ; + + wcsse = wcsse/(cnt_r); + + // System.out.println("wcsse = " + wcsse); + + float wcss = (float) wcsse; + + float[][] ret = new float[3][]; + ret[0] = new float[1]; + ret[0][0] = cnt_r; + ret[1] = x_r; + ret[2]= new float [1]; + ret[2][0]= wcss; + return ret; + + + } + + + public long hashvec2( float[] xt, float[] x, + HashMap MapOfIDAndCent, HashMap MapOfIDAndCount, int ct, float[] rngvec, HashMap MapOfIDAndWCSS) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { +// s <<= 1; + s = s << 1 ; // left shift the bits of s by 1. 
+ if (xt[i] > rngvec[i]) + s= s+1; + + if (MapOfIDAndCent.containsKey(s)) { + + float CurrentCount = MapOfIDAndCount.get(s); + float CurrentCent [] = MapOfIDAndCent.get(s); + float CountForIncomingVector = 1; + float IncomingVector [] = x; + float currentWcss= MapOfIDAndWCSS.get(s); + float incomingWcss= 0; + + + float[][] MergedValues = UpdateHashMap(CurrentCount , CurrentCent, currentWcss, CountForIncomingVector, IncomingVector, incomingWcss ); + + Long UpdatedCount = (long) MergedValues[0][0] ; + + float[] MergedVector = MergedValues[1] ; + + float wcss= MergedValues[2][0]; + + MapOfIDAndCount.put(s , UpdatedCount); + + MapOfIDAndCent.put(s, MergedVector); + + MapOfIDAndWCSS.put(s, wcss); + + + } + + else { + + float[] xlist = x; + MapOfIDAndCent.put(s, xlist); + MapOfIDAndCount.put(s, (long)1); + MapOfIDAndWCSS.put(s, (float)0); + } + } + return s; + } + + + /* + * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid + * vector map + * + * hash the projected vector x and update the hash to centroid and counts + * maps + */ + void addtocounter(float[] x, Projector p, + HashMap IDAndCent,HashMap IDandID,int ct, float[] rngvec , HashMap IDandWCSS) { + float[] xt = p.project(x); + + hashvec2(xt,x,IDAndCent, IDandID, ct,rngvec , IDandWCSS); + } + + + static boolean isPowerOfTwo(long num) { + return (num & -num) == num; + } + + + + /* + * X - data set k - canonical k in k-means l - clustering sub-space Compute + * density mode via iterative deepening hash counting + */ + + + public Multimap findDensityModes2() { + //public Map findDensityModes2() { + HashMap MapOfIDAndCent1 = new HashMap<>(); + HashMap MapOfIDAndCount1 = new HashMap<>(); + HashMap MapOfIDAndWCSS1 = new HashMap<>(); + + HashMap MapOfIDAndCent2 = new HashMap<>(); + HashMap MapOfIDAndCount2 = new HashMap<>(); + HashMap MapOfIDAndWCSS2 = new HashMap<>(); + + HashMap MapOfIDAndCent3 = new HashMap<>(); + HashMap MapOfIDAndCount3 = new HashMap<>(); + HashMap MapOfIDAndWCSS3 = new 
HashMap<>(); + + HashMap MapOfIDAndCent = new HashMap<>(); + HashMap MapOfIDAndCount = new HashMap<>(); + HashMap MapOfIDAndWCSS = new HashMap<>(); + + + + // #create projector matrixs + Projector projector = so.getProjectionType(); + projector.setOrigDim(so.getdim()); + projector.setProjectedDim(so.getDimparameter()); + projector.setRandomSeed(so.getRandomSeed()); + projector.init(); + int cutoff = so.getCutoff(); + + int ct = 0; + + { + + for (float[] x : so.getRawData()) + { + addtocounter(x, projector, MapOfIDAndCent1, MapOfIDAndCount1,ct++, rngvec, MapOfIDAndWCSS1); + addtocounter(x, projector, MapOfIDAndCent2, MapOfIDAndCount2,ct++, rngvec2,MapOfIDAndWCSS2); + addtocounter(x, projector, MapOfIDAndCent3, MapOfIDAndCount3,ct++, rngvec3,MapOfIDAndWCSS3); + + + } + } + + + float WCSS1 = 0; + float WCSS2 = 0; + float WCSS3 = 0; + + + for (Long cur_id : (MapOfIDAndWCSS1.keySet())) + + { // System.out.println("wcss1 = " + MapOfIDAndWCSS1.get(cur_id)); + WCSS1 = WCSS1 + MapOfIDAndWCSS1.get(cur_id);} + + for (Long cur_id : (MapOfIDAndWCSS2.keySet())) + + { WCSS2 = WCSS2 + MapOfIDAndWCSS2.get(cur_id);} + + for (Long cur_id : (MapOfIDAndWCSS3.keySet())) + + { WCSS3 = WCSS3 + MapOfIDAndWCSS3.get(cur_id);} + +// System.out.println("wcss1 = " + WCSS1); +// System.out.println("wcss2 = " + WCSS2); +// System.out.println("wcss3 = " + WCSS3); + + if ((WCSS1 >= WCSS2) && (WCSS1>=WCSS3)) + {MapOfIDAndCount = MapOfIDAndCount1; + MapOfIDAndCent = MapOfIDAndCent1; + MapOfIDAndWCSS = MapOfIDAndWCSS1; + System.out.println("winner = tree1"); + } + else if ((WCSS2 >= WCSS1) && (WCSS2>=WCSS3)) + {MapOfIDAndCount = MapOfIDAndCount2; + MapOfIDAndCent = MapOfIDAndCent2; + MapOfIDAndWCSS = MapOfIDAndWCSS2; + System.out.println("winner = tree2"); + } + else + {MapOfIDAndCount = MapOfIDAndCount3; + MapOfIDAndCent = MapOfIDAndCent3; + MapOfIDAndWCSS = MapOfIDAndWCSS3; + System.out.println("winner = tree3"); + + } + + + System.out.println("NumberOfMicroClustersBeforePruning = "+ 
MapOfIDAndCent.size()); + + // next we want to prune the tree by parent count comparison + // follows breadthfirst search + + + + HashMap denseSetOfIDandCount2 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + MapOfIDAndCent.put(parent_id, new float[]{}); + + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent.put(parent_id, new float[]{}); + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2.put(cur_id, (long) cur_count); + } + } + } + } + } + + + System.out.println("NumberOfMicroClustersAfterPruning&beforesortingLimit = "+ denseSetOfIDandCount2.size()); + + //remove keys with support less than 1 + Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); + + + List sortedIDList2= new ArrayList<>(); + // sort and limit the list + stream2.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2.add(x.getKey())); + + + Multimap multimapWeightAndCent = ArrayListMultimap.create(); + + + for (Long keys: sortedIDList2) + + { + + + multimapWeightAndCent.put((Long)(MapOfIDAndCount.get(keys)), (float[]) (MapOfIDAndCent.get(keys))); + + + } + + + return multimapWeightAndCent; + +} + + + public void run() { + rngvec = new float[so.getDimparameter()]; + + rngvec2 = new float[so.getDimparameter()]; + + rngvec3 = new float[so.getDimparameter()]; + + counter = 0; + boolean randVect = so.getRandomVector(); + + // Random r = new Random(so.getRandomSeed()); + // Random r = new Random(3800635955020675334L) ; + Random r = new Random(); + Random r2 = new Random(); + Random r3 = new Random(); + + if (randVect==true){ + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) r.nextGaussian(); + + for (int i = 0; i < so.getDimparameter(); i++) + rngvec2[i] = (float) r2.nextGaussian(); + + for (int i = 0; i < so.getDimparameter(); i++) + rngvec3[i] = (float) r3.nextGaussian(); + + + } else { + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) 0; + } + + + + + Multimap WeightAndClusters = findDensityModes2(); + + + Listcentroids2 = new ArrayList<>(); + List weights2 =new ArrayList<>(); + + + System.out.println("NumberOfMicroClusters_AfterPruning = "+ WeightAndClusters.size()); + System.out.println("getRandomVector = "+ randVect); + + + for (Long weights : WeightAndClusters.keys()) + { + + weights2.add((float)weights); + + } + + + for (Long weight : WeightAndClusters.keySet()) + + { + + centroids2.addAll(WeightAndClusters.get(weight)); + + } + + + Agglomerative3 aggloOffline = new Agglomerative3(centroids2, so.getk()); + + aggloOffline.setWeights(weights2); + + this.centroids = aggloOffline.getCentroids(); + + } + + + public static void main(String[] args) throws FileNotFoundException, + IOException { + + int k = 10;//6; + int d = 200;//16; + int n = 100000; + float 
var = 1.5f; + int count = 1; + // System.out.printf("ClusterVar\t"); + // for (int i = 0; i < count; i++) + // System.out.printf("Trial%d\t", i); + // System.out.printf("RealWCSS\n"); + + String Output = "/C:/Users/deysn/Desktop/temp/OutputTwrpCents1" ; + + float f = var; + float avgrealwcss = 0; + float avgtime = 0; + // System.out.printf("%f\t", f); + GenerateData gen = new GenerateData(k, n/k, d, f, true, .5f); + + // gen.writeCSVToFile(new File("/home/lee/Desktop/reclsh/in.csv")); + + // List data = "/C:/Users/user/Desktop/temp/OutputTwrpCents1" + + RPHashObject o = new SimpleArrayReader(gen.data, k); + + o.setDimparameter(16); + + o.setCutoff(100); + o.setRandomVector(true); + +// System.out.println("cutoff = "+ o.getCutoff()); +// System.out.println("get_random_Vector = "+ o.getRandomVector()); + + TWRPv6_COV rphit = new TWRPv6_COV(o); + long startTime = System.nanoTime(); + List centsr = rphit.getCentroids(); + + avgtime += (System.nanoTime() - startTime) / 100000000; + + avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(), + gen.getData()); + + VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); + + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.gc(); + + System.out.printf("%.0f\n", avgrealwcss / count); + + + } + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + this.so.setK(getk); + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean 
setMultiRun(int runs) { + return false; + } + + //@Override + public void setCutoff(int getCutoff) { + this.so.setCutoff(getCutoff); + } + + //@Override + public void setRandomVector(boolean getRandomVector) { + this.so.setRandomVector(getRandomVector); + } + + +} diff --git a/src/main/java/edu/uc/rphash/TWRPv6_WCSS2.java b/src/main/java/edu/uc/rphash/TWRPv6_WCSS2.java new file mode 100644 index 0000000..be50273 --- /dev/null +++ b/src/main/java/edu/uc/rphash/TWRPv6_WCSS2.java @@ -0,0 +1,626 @@ +package edu.uc.rphash; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +//import java.util.Arrays; +import java.util.HashMap; +//import java.util.Iterator; +//import java.util.LinkedHashMap; +import java.util.List; +//import java.util.Map; +import java.util.Map.Entry; +import java.util.Random; +import java.util.TreeSet; +import java.util.stream.Stream; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.clusterers.Agglomerative3; +import edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.VectorUtil; + +//import org.apache.commons.collections.map.MultiValueMap; +//import org.apache.commons.collections.map.*; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Multimap; + + + +public class TWRPv6_WCSS2 implements Clusterer, Runnable { + + boolean znorm = false; + + private int counter; + private float[] rngvec; + private float[] rngvec2; + private float[] rngvec3; + + private List centroids = null; + + private RPHashObject so; + + public TWRPv6_WCSS2(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + return getCentroids(); + } + + @Override + public List getCentroids() { + if (centroids == null) + run(); + return centroids; + } + + + /* + + /* 
+ * X - set of vectors compute the medoid of a vector set + */ + float[] medoid(List X) { + float[] ret = X.get(0); + for (int i = 1; i < X.size(); i++) { + for (int j = 0; j < ret.length; j++) { + ret[j] += X.get(i)[j]; + } + } + for (int j = 0; j < ret.length; j++) { + ret[j] = ret[j] / ((float) X.size()); + } + return ret; + } + +// this updates the map two cents with different weigths are merged into one. + public static float[][] UpdateHashMap(float cnt_1, float[] x_1, float wcss_1, + float cnt_2, float[] x_2 , float wcss_2) { + + float cnt_r = cnt_1 + cnt_2; + + float[] x_r = new float[x_1.length]; + + float[] var_r1 = new float[x_1.length]; + float[] var_r2 = new float[x_1.length]; + + double var1=0; + double var2=0; + + + for (int i = 0; i < x_1.length; i++) { + x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; + + var_r1[i] = ((-x_r[i] + x_1[i]) * (-x_r[i] + x_1[i]))/1000000000; + + var_r2[i] =(((-x_r[i] + x_2[i]) * (-x_r[i] + x_2[i])))/1000000000; + + } + + for (int i = 0; i < var_r1.length; i++) { + var1 = var1 + var_r1[i]; + + var2 = var2 + var_r2[i]; + } + double wcsse=0; + wcsse = ( cnt_1*(wcss_1*wcss_1 + (var1)) + var2 / (cnt_1 + cnt_2 ) ) ; + + // System.out.println("wcsse = " + wcsse); + + float wcss = (float) wcsse; + + float[][] ret = new float[3][]; + ret[0] = new float[1]; + ret[0][0] = cnt_r; + ret[1] = x_r; + ret[2]= new float [1]; + ret[2][0]= wcss; + return ret; + + + } + + + public long hashvec2( float[] xt, float[] x, + HashMap MapOfIDAndCent, HashMap MapOfIDAndCount, int ct, float[] rngvec, HashMap MapOfIDAndWCSS) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { +// s <<= 1; + s = s << 1 ; // left shift the bits of s by 1. 
+ if (xt[i] > rngvec[i]) + s= s+1; + + if (MapOfIDAndCent.containsKey(s)) { + + float CurrentCount = MapOfIDAndCount.get(s); + float CurrentCent [] = MapOfIDAndCent.get(s); + float CountForIncomingVector = 1; + float IncomingVector [] = x; + float currentWcss= MapOfIDAndWCSS.get(s); + float incomingWcss= 0; + + + float[][] MergedValues = UpdateHashMap(CurrentCount , CurrentCent, currentWcss, CountForIncomingVector, IncomingVector, incomingWcss ); + + Long UpdatedCount = (long) MergedValues[0][0] ; + + float[] MergedVector = MergedValues[1] ; + + float wcss= MergedValues[2][0]; + + MapOfIDAndCount.put(s , UpdatedCount); + + MapOfIDAndCent.put(s, MergedVector); + + MapOfIDAndWCSS.put(s, wcss); + + + } + + else { + + float[] xlist = x; + MapOfIDAndCent.put(s, xlist); + MapOfIDAndCount.put(s, (long)1); + MapOfIDAndWCSS.put(s, (float)0); + } + } + return s; + } + + + /* + * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid + * vector map + * + * hash the projected vector x and update the hash to centroid and counts + * maps + */ + void addtocounter(float[] x, Projector p, + HashMap IDAndCent,HashMap IDandID,int ct, float[] rngvec , HashMap IDandWCSS) { + float[] xt = p.project(x); + + hashvec2(xt,x,IDAndCent, IDandID, ct,rngvec , IDandWCSS); + } + + + static boolean isPowerOfTwo(long num) { + return (num & -num) == num; + } + + + + /* + * X - data set k - canonical k in k-means l - clustering sub-space Compute + * density mode via iterative deepening hash counting + */ + + + public Multimap findDensityModes2() { + //public Map findDensityModes2() { + HashMap MapOfIDAndCent1 = new HashMap<>(); + HashMap MapOfIDAndCount1 = new HashMap<>(); + HashMap MapOfIDAndWCSS1 = new HashMap<>(); + + HashMap MapOfIDAndCent2 = new HashMap<>(); + HashMap MapOfIDAndCount2 = new HashMap<>(); + HashMap MapOfIDAndWCSS2 = new HashMap<>(); + + HashMap MapOfIDAndCent3 = new HashMap<>(); + HashMap MapOfIDAndCount3 = new HashMap<>(); + HashMap MapOfIDAndWCSS3 = new 
HashMap<>(); + + + + // #create projector matrixs + Projector projector = so.getProjectionType(); + projector.setOrigDim(so.getdim()); + projector.setProjectedDim(so.getDimparameter()); + projector.setRandomSeed(so.getRandomSeed()); + projector.init(); + int cutoff = so.getCutoff(); + + int ct = 0; + + { + + for (float[] x : so.getRawData()) + { + addtocounter(x, projector, MapOfIDAndCent1, MapOfIDAndCount1,ct++, rngvec, MapOfIDAndWCSS1); + addtocounter(x, projector, MapOfIDAndCent2, MapOfIDAndCount2,ct++, rngvec2,MapOfIDAndWCSS2); + addtocounter(x, projector, MapOfIDAndCent3, MapOfIDAndCount3,ct++, rngvec3,MapOfIDAndWCSS3); + + + } + } + + + + + System.out.println("NumberOfMicroClustersBeforePruning = "+ MapOfIDAndCent1.size()); + + // next we want to prune the tree by parent count comparison + // follows breadthfirst search + + + + HashMap denseSetOfIDandCount2_1 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount1.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount1.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount1.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_1.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + MapOfIDAndCent1.put(parent_id, new float[]{}); + + // MapOfIDAndCount1.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_1.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent1.put(parent_id, new float[]{}); + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + } + } + } + } + } + + + + + HashMap denseSetOfIDandCount2_2 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount2.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = 
(int) (MapOfIDAndCount2.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount2.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_2.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + MapOfIDAndCent2.put(parent_id, new float[]{}); + + // MapOfIDAndCount2.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_2.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent2.put(parent_id, new float[]{}); + // MapOfIDAndCount2.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + } + } + } + } + } + + + + HashMap denseSetOfIDandCount2_3 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount3.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount3.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount3.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_3.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + MapOfIDAndCent3.put(parent_id, new float[]{}); + + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_3.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent3.put(parent_id, new float[]{}); + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + } + } + } + } + } + + float WCSS1 = 0; + float WCSS2 = 0; + float WCSS3 = 0; + HashMap denseSetOfIDandCount2 = new HashMap(); + HashMap MapOfIDAndCent = new HashMap<>(); + HashMap MapOfIDAndCount = new 
HashMap<>(); + HashMap MapOfIDAndWCSS = new HashMap<>(); + + for (Long cur_id : (denseSetOfIDandCount2_1.keySet())) + + { // System.out.println("wcss1 = " + MapOfIDAndWCSS1.get(cur_id)); + WCSS1 = WCSS1 + MapOfIDAndWCSS1.get(cur_id);} + + for (Long cur_id : (denseSetOfIDandCount2_2.keySet())) + + { WCSS2 = WCSS2 + MapOfIDAndWCSS2.get(cur_id);} + + for (Long cur_id : (denseSetOfIDandCount2_3.keySet())) + + { WCSS3 = WCSS3 + MapOfIDAndWCSS3.get(cur_id);} + +// System.out.println("wcss1 = " + WCSS1); +// System.out.println("wcss2 = " + WCSS2); +// System.out.println("wcss3 = " + WCSS3); + + if ((WCSS1 >= WCSS2) && (WCSS1>=WCSS3)) + {MapOfIDAndCount = MapOfIDAndCount1; + MapOfIDAndCent = MapOfIDAndCent1; + MapOfIDAndWCSS = MapOfIDAndWCSS1; + denseSetOfIDandCount2 = denseSetOfIDandCount2_1; + System.out.println("winner = tree1"); + } + else if ((WCSS2 >= WCSS1) && (WCSS2>=WCSS3)) + {MapOfIDAndCount = MapOfIDAndCount2; + MapOfIDAndCent = MapOfIDAndCent2; + MapOfIDAndWCSS = MapOfIDAndWCSS2; + denseSetOfIDandCount2 = denseSetOfIDandCount2_2; + System.out.println("winner = tree2"); + } + else + {MapOfIDAndCount = MapOfIDAndCount3; + MapOfIDAndCent = MapOfIDAndCent3; + MapOfIDAndWCSS = MapOfIDAndWCSS3; + denseSetOfIDandCount2 = denseSetOfIDandCount2_3; + System.out.println("winner = tree3"); + + } + + System.out.println("NumberOfMicroClustersAfterPruning&beforesortingLimit = "+ denseSetOfIDandCount2.size()); + + //remove keys with support less than 1 + Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); + + + List sortedIDList2= new ArrayList<>(); + // sort and limit the list + stream2.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2.add(x.getKey())); + + + Multimap multimapWeightAndCent = ArrayListMultimap.create(); + + + for (Long keys: sortedIDList2) + + { + + + multimapWeightAndCent.put((Long)(MapOfIDAndCount.get(keys)), (float[]) (MapOfIDAndCent.get(keys))); + + + } + + + return multimapWeightAndCent; + +} + + + public void run() { + rngvec = new float[so.getDimparameter()]; + + rngvec2 = new float[so.getDimparameter()]; + + rngvec3 = new float[so.getDimparameter()]; + + counter = 0; + boolean randVect = so.getRandomVector(); + + // Random r = new Random(so.getRandomSeed()); + // Random r = new Random(3800635955020675334L) ; + Random r = new Random(); + Random r2 = new Random(); + Random r3 = new Random(); + + if (randVect==true){ + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) r.nextGaussian(); + + for (int i = 0; i < so.getDimparameter(); i++) + rngvec2[i] = (float) r2.nextGaussian(); + + for (int i = 0; i < so.getDimparameter(); i++) + rngvec3[i] = (float) r3.nextGaussian(); + + + } else { + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) 0; + } + + + + + Multimap WeightAndClusters = findDensityModes2(); + + + Listcentroids2 = new ArrayList<>(); + List weights2 =new ArrayList<>(); + + + System.out.println("NumberOfMicroClusters_AfterPruning = "+ WeightAndClusters.size()); + System.out.println("getRandomVector = "+ randVect); + + + for (Long weights : WeightAndClusters.keys()) + { + + weights2.add((float)weights); + + } + + + for (Long weight : WeightAndClusters.keySet()) + + { + + centroids2.addAll(WeightAndClusters.get(weight)); + + } + + + Agglomerative3 aggloOffline = new Agglomerative3(centroids2, so.getk()); + + aggloOffline.setWeights(weights2); + + this.centroids = aggloOffline.getCentroids(); + + } + + + public static void main(String[] args) throws FileNotFoundException, + IOException { + + int k = 10;//6; + int d = 200;//16; + int n = 100000; + float 
var = 1.5f; + int count = 1; + // System.out.printf("ClusterVar\t"); + // for (int i = 0; i < count; i++) + // System.out.printf("Trial%d\t", i); + // System.out.printf("RealWCSS\n"); + + String Output = "/C:/Users/deysn/Desktop/temp/OutputTwrpCents1" ; + + float f = var; + float avgrealwcss = 0; + float avgtime = 0; + // System.out.printf("%f\t", f); + GenerateData gen = new GenerateData(k, n/k, d, f, true, .5f); + + // gen.writeCSVToFile(new File("/home/lee/Desktop/reclsh/in.csv")); + + // List data = "/C:/Users/user/Desktop/temp/OutputTwrpCents1" + + RPHashObject o = new SimpleArrayReader(gen.data, k); + + o.setDimparameter(16); + + o.setCutoff(100); + o.setRandomVector(true); + +// System.out.println("cutoff = "+ o.getCutoff()); +// System.out.println("get_random_Vector = "+ o.getRandomVector()); + + TWRPv6_WCSS2 rphit = new TWRPv6_WCSS2(o); + long startTime = System.nanoTime(); + List centsr = rphit.getCentroids(); + + avgtime += (System.nanoTime() - startTime) / 100000000; + + avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(), + gen.getData()); + + VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); + + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.gc(); + + System.out.printf("%.0f\n", avgrealwcss / count); + + + } + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + this.so.setK(getk); + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean 
setMultiRun(int runs) { + return false; + } + + //@Override + public void setCutoff(int getCutoff) { + this.so.setCutoff(getCutoff); + } + + //@Override + public void setRandomVector(boolean getRandomVector) { + this.so.setRandomVector(getRandomVector); + } + + +} diff --git a/src/main/java/edu/uc/rphash/TWRPv6_meanVariance.java b/src/main/java/edu/uc/rphash/TWRPv6_meanVariance.java new file mode 100644 index 0000000..a04f241 --- /dev/null +++ b/src/main/java/edu/uc/rphash/TWRPv6_meanVariance.java @@ -0,0 +1,549 @@ +package edu.uc.rphash; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +//import java.util.Arrays; +import java.util.HashMap; +//import java.util.Iterator; +//import java.util.LinkedHashMap; +import java.util.List; +//import java.util.Map; +import java.util.Map.Entry; +import java.util.Random; +import java.util.TreeSet; +import java.util.stream.Stream; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.clusterers.Agglomerative3; +import edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.VectorUtil; + +//import org.apache.commons.collections.map.MultiValueMap; +//import org.apache.commons.collections.map.*; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Multimap; + + + +public class TWRPv6_meanVariance implements Clusterer, Runnable { + + boolean znorm = false; + + private int counter; + private float[] rngvec; + private float[] rngvec2; + private float[] rngvec3; + + private List centroids = null; + + private RPHashObject so; + + public TWRPv6_meanVariance(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + return getCentroids(); + } + + @Override + public List getCentroids() { + if (centroids == null) + run(); + 
return centroids; + } + + + /* + + /* + * X - set of vectors compute the medoid of a vector set + */ + float[] medoid(List X) { + float[] ret = X.get(0); + for (int i = 1; i < X.size(); i++) { + for (int j = 0; j < ret.length; j++) { + ret[j] += X.get(i)[j]; + } + } + for (int j = 0; j < ret.length; j++) { + ret[j] = ret[j] / ((float) X.size()); + } + return ret; + } + +// this updates the map two cents with different weigths are merged into one. + public static float[][] UpdateHashMap(float cnt_1, float[] x_1, float wcss_1, + float cnt_2, float[] x_2 , float wcss_2) { + + float cnt_r = cnt_1 + cnt_2; + + float[] x_r = new float[x_1.length]; + + float[] var_r1 = new float[x_1.length]; + float[] var_r2 = new float[x_1.length]; + + double var1=0; + double var2=0; + + + for (int i = 0; i < x_1.length; i++) { + x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; + + var_r1[i] = ((-x_r[i] + x_1[i]) * (-x_r[i] + x_1[i]))/1000000000; + + var_r2[i] =(((-x_r[i] + x_2[i]) * (-x_r[i] + x_2[i])))/1000000000; + + } + + for (int i = 0; i < var_r1.length; i++) { + var1 = var1 + var_r1[i]; + + var2 = var2 + var_r2[i]; + } + double wcsse=0; +// wcsse = ( cnt_1*(wcss_1*wcss_1 + (var1)) + var2 / (cnt_1 + cnt_2 ) ) ; + + wcsse = ( cnt_1*(wcss_1 + (var1)) + var2 / (cnt_1 + cnt_2 ) ) ; + + wcsse = wcsse/cnt_r; + + // System.out.println("wcsse = " + wcsse); + + float wcss = (float) wcsse; + + float[][] ret = new float[3][]; + ret[0] = new float[1]; + ret[0][0] = cnt_r; + ret[1] = x_r; + ret[2]= new float [1]; + ret[2][0]= wcss; + return ret; + + + } + + + public long hashvec2( float[] xt, float[] x, + HashMap MapOfIDAndCent, HashMap MapOfIDAndCount, int ct, float[] rngvec, HashMap MapOfIDAndWCSS) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { +// s <<= 1; + s = s << 1 ; // left shift the bits of s by 1. 
+ if (xt[i] > rngvec[i]) + s= s+1; + + if (MapOfIDAndCent.containsKey(s)) { + + float CurrentCount = MapOfIDAndCount.get(s); + float CurrentCent [] = MapOfIDAndCent.get(s); + float CountForIncomingVector = 1; + float IncomingVector [] = x; + float currentWcss= MapOfIDAndWCSS.get(s); + float incomingWcss= 0; + + + float[][] MergedValues = UpdateHashMap(CurrentCount , CurrentCent, currentWcss, CountForIncomingVector, IncomingVector, incomingWcss ); + + Long UpdatedCount = (long) MergedValues[0][0] ; + + float[] MergedVector = MergedValues[1] ; + + float wcss= MergedValues[2][0]; + + MapOfIDAndCount.put(s , UpdatedCount); + + MapOfIDAndCent.put(s, MergedVector); + + MapOfIDAndWCSS.put(s, wcss); + + + } + + else { + + float[] xlist = x; + MapOfIDAndCent.put(s, xlist); + MapOfIDAndCount.put(s, (long)1); + MapOfIDAndWCSS.put(s, (float)0); + } + } + return s; + } + + + /* + * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid + * vector map + * + * hash the projected vector x and update the hash to centroid and counts + * maps + */ + void addtocounter(float[] x, Projector p, + HashMap IDAndCent,HashMap IDandID,int ct, float[] rngvec , HashMap IDandWCSS) { + float[] xt = p.project(x); + + hashvec2(xt,x,IDAndCent, IDandID, ct,rngvec , IDandWCSS); + } + + + static boolean isPowerOfTwo(long num) { + return (num & -num) == num; + } + + + + /* + * X - data set k - canonical k in k-means l - clustering sub-space Compute + * density mode via iterative deepening hash counting + */ + + + public Multimap findDensityModes2() { + //public Map findDensityModes2() { + HashMap MapOfIDAndCent1 = new HashMap<>(); + HashMap MapOfIDAndCount1 = new HashMap<>(); + HashMap MapOfIDAndWCSS1 = new HashMap<>(); + + HashMap MapOfIDAndCent2 = new HashMap<>(); + HashMap MapOfIDAndCount2 = new HashMap<>(); + HashMap MapOfIDAndWCSS2 = new HashMap<>(); + + HashMap MapOfIDAndCent3 = new HashMap<>(); + HashMap MapOfIDAndCount3 = new HashMap<>(); + HashMap MapOfIDAndWCSS3 = new 
HashMap<>(); + + HashMap MapOfIDAndCent = new HashMap<>(); + HashMap MapOfIDAndCount = new HashMap<>(); + HashMap MapOfIDAndWCSS = new HashMap<>(); + + + + // #create projector matrixs + Projector projector = so.getProjectionType(); + projector.setOrigDim(so.getdim()); + projector.setProjectedDim(so.getDimparameter()); + projector.setRandomSeed(so.getRandomSeed()); + projector.init(); + int cutoff = so.getCutoff(); + + int ct = 0; + + { + + for (float[] x : so.getRawData()) + { + addtocounter(x, projector, MapOfIDAndCent1, MapOfIDAndCount1,ct++, rngvec, MapOfIDAndWCSS1); + addtocounter(x, projector, MapOfIDAndCent2, MapOfIDAndCount2,ct++, rngvec2,MapOfIDAndWCSS2); + addtocounter(x, projector, MapOfIDAndCent3, MapOfIDAndCount3,ct++, rngvec3,MapOfIDAndWCSS3); + + + } + } + + + float WCSS1 = 0; + float WCSS2 = 0; + float WCSS3 = 0; + + + for (Long cur_id : (MapOfIDAndWCSS1.keySet())) + + { // System.out.println("wcss1 = " + MapOfIDAndWCSS1.get(cur_id)); + WCSS1 = WCSS1 + MapOfIDAndWCSS1.get(cur_id);} + + for (Long cur_id : (MapOfIDAndWCSS2.keySet())) + + { WCSS2 = WCSS2 + MapOfIDAndWCSS2.get(cur_id);} + + for (Long cur_id : (MapOfIDAndWCSS3.keySet())) + + { WCSS3 = WCSS3 + MapOfIDAndWCSS3.get(cur_id);} + +// System.out.println("wcss1 = " + WCSS1); +// System.out.println("wcss2 = " + WCSS2); +// System.out.println("wcss3 = " + WCSS3); + + if ((WCSS1 >= WCSS2) && (WCSS1>=WCSS3)) + {MapOfIDAndCount = MapOfIDAndCount1; + MapOfIDAndCent = MapOfIDAndCent1; + MapOfIDAndWCSS = MapOfIDAndWCSS1; + System.out.println("winner = tree1"); + } + else if ((WCSS2 >= WCSS1) && (WCSS2>=WCSS3)) + {MapOfIDAndCount = MapOfIDAndCount2; + MapOfIDAndCent = MapOfIDAndCent2; + MapOfIDAndWCSS = MapOfIDAndWCSS2; + System.out.println("winner = tree2"); + } + else + {MapOfIDAndCount = MapOfIDAndCount3; + MapOfIDAndCent = MapOfIDAndCent3; + MapOfIDAndWCSS = MapOfIDAndWCSS3; + System.out.println("winner = tree3"); + + } + + + System.out.println("NumberOfMicroClustersBeforePruning = "+ 
MapOfIDAndCent.size()); + + // next we want to prune the tree by parent count comparison + // follows breadthfirst search + + + + HashMap denseSetOfIDandCount2 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + MapOfIDAndCent.put(parent_id, new float[]{}); + + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent.put(parent_id, new float[]{}); + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2.put(cur_id, (long) cur_count); + } + } + } + } + } + + + System.out.println("NumberOfMicroClustersAfterPruning&beforesortingLimit = "+ denseSetOfIDandCount2.size()); + + //remove keys with support less than 1 + Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); + + + List sortedIDList2= new ArrayList<>(); + // sort and limit the list + stream2.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2.add(x.getKey())); + + + Multimap multimapWeightAndCent = ArrayListMultimap.create(); + + + for (Long keys: sortedIDList2) + + { + + + multimapWeightAndCent.put((Long)(MapOfIDAndCount.get(keys)), (float[]) (MapOfIDAndCent.get(keys))); + + + } + + + return multimapWeightAndCent; + +} + + + public void run() { + rngvec = new float[so.getDimparameter()]; + + rngvec2 = new float[so.getDimparameter()]; + + rngvec3 = new float[so.getDimparameter()]; + + counter = 0; + boolean randVect = so.getRandomVector(); + + // Random r = new Random(so.getRandomSeed()); + // Random r = new Random(3800635955020675334L) ; + Random r = new Random(); + Random r2 = new Random(); + Random r3 = new Random(); + + if (randVect==true){ + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) r.nextGaussian(); + + for (int i = 0; i < so.getDimparameter(); i++) + rngvec2[i] = (float) r2.nextGaussian(); + + for (int i = 0; i < so.getDimparameter(); i++) + rngvec3[i] = (float) r3.nextGaussian(); + + + } else { + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) 0; + } + + + + + Multimap WeightAndClusters = findDensityModes2(); + + + Listcentroids2 = new ArrayList<>(); + List weights2 =new ArrayList<>(); + + + System.out.println("NumberOfMicroClusters_AfterPruning = "+ WeightAndClusters.size()); + System.out.println("getRandomVector = "+ randVect); + + + for (Long weights : WeightAndClusters.keys()) + { + + weights2.add((float)weights); + + } + + + for (Long weight : WeightAndClusters.keySet()) + + { + + centroids2.addAll(WeightAndClusters.get(weight)); + + } + + + Agglomerative3 aggloOffline = new Agglomerative3(centroids2, so.getk()); + + aggloOffline.setWeights(weights2); + + this.centroids = aggloOffline.getCentroids(); + + } + + + public static void main(String[] args) throws FileNotFoundException, + IOException { + + int k = 10;//6; + int d = 200;//16; + int n = 100000; + float 
var = 1.5f; + int count = 1; + // System.out.printf("ClusterVar\t"); + // for (int i = 0; i < count; i++) + // System.out.printf("Trial%d\t", i); + // System.out.printf("RealWCSS\n"); + + String Output = "/C:/Users/deysn/Desktop/temp/OutputTwrpCents1" ; + + float f = var; + float avgrealwcss = 0; + float avgtime = 0; + // System.out.printf("%f\t", f); + GenerateData gen = new GenerateData(k, n/k, d, f, true, .5f); + + // gen.writeCSVToFile(new File("/home/lee/Desktop/reclsh/in.csv")); + + // List data = "/C:/Users/user/Desktop/temp/OutputTwrpCents1" + + RPHashObject o = new SimpleArrayReader(gen.data, k); + + o.setDimparameter(16); + + o.setCutoff(100); + o.setRandomVector(true); + +// System.out.println("cutoff = "+ o.getCutoff()); +// System.out.println("get_random_Vector = "+ o.getRandomVector()); + + TWRPv6_meanVariance rphit = new TWRPv6_meanVariance(o); + long startTime = System.nanoTime(); + List centsr = rphit.getCentroids(); + + avgtime += (System.nanoTime() - startTime) / 100000000; + + avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(), + gen.getData()); + + VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); + + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.gc(); + + System.out.printf("%.0f\n", avgrealwcss / count); + + + } + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + this.so.setK(getk); + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public 
boolean setMultiRun(int runs) { + return false; + } + + //@Override + public void setCutoff(int getCutoff) { + this.so.setCutoff(getCutoff); + } + + //@Override + public void setRandomVector(boolean getRandomVector) { + this.so.setRandomVector(getRandomVector); + } + + +} From af43d9b7a484aea6e6e2570d43a27fe6f5d14698 Mon Sep 17 00:00:00 2001 From: deysn Date: Thu, 7 Nov 2019 09:21:18 -0500 Subject: [PATCH 09/29] changed the wcss to calculate squared distance. updated the online wcss calculation. --- src/main/java/edu/uc/rphash/TWRPv5_WCSS.java | 67 ++-- src/main/java/edu/uc/rphash/TWRPv6_COV.java | 275 +++++++++++----- src/main/java/edu/uc/rphash/TWRPv6_WCSS2.java | 120 ++++--- .../edu/uc/rphash/TWRPv6_meanVariance.java | 309 ++++++++++++------ .../java/edu/uc/rphash/tests/StatTests.java | 2 +- .../uc/rphash/tests/clusterers/KMeans2.java | 4 +- .../java/edu/uc/rphash/util/VectorUtil.java | 11 + 7 files changed, 534 insertions(+), 254 deletions(-) diff --git a/src/main/java/edu/uc/rphash/TWRPv5_WCSS.java b/src/main/java/edu/uc/rphash/TWRPv5_WCSS.java index 4559d38..54d3f7a 100644 --- a/src/main/java/edu/uc/rphash/TWRPv5_WCSS.java +++ b/src/main/java/edu/uc/rphash/TWRPv5_WCSS.java @@ -60,7 +60,22 @@ public List getCentroids() { } - /* + + // This function returns the square of the euclidean distance. 
+ public static float distance(float[] x, float[] y) { + if (x.length < 1) + return Float.MAX_VALUE; + if (y.length < 1) + return Float.MAX_VALUE; + float dist = (x[0] - y[0]) * (x[0] - y[0]); + for (int i = 1; i < x.length; i++) + dist += ((x[i] - y[i]) * (x[i] - y[i])); +// return (float) Math.sqrt(dist); + return dist; + } + + + /* * X - set of vectors compute the medoid of a vector set @@ -86,34 +101,28 @@ public static float[][] UpdateHashMap(float cnt_1, float[] x_1, float wcss_1, float[] x_r = new float[x_1.length]; - float[] var_r1 = new float[x_1.length]; - float[] var_r2 = new float[x_1.length]; - - double var1=0; - double var2=0; - for (int i = 0; i < x_1.length; i++) { x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; - - var_r1[i] = ((-x_r[i] + x_1[i]) * (-x_r[i] + x_1[i]))/1000000000; - - var_r2[i] =(((-x_r[i] + x_2[i]) * (-x_r[i] + x_2[i])))/1000000000; - + } - for (int i = 0; i < var_r1.length; i++) { - var1 = var1 + var_r1[i]; - - var2 = var2 + var_r2[i]; - } - double wcsse=0; - wcsse = ( cnt_1*(wcss_1*wcss_1 + (var1)) + var2 / (cnt_1 + cnt_2 ) ) ; - - // System.out.println("wcsse = " + wcsse); - - float wcss = (float) wcsse; + float wcss = distance(x_r,x_2) + wcss_1; + + +// float wcss = ( ( cnt_1*(wcss_1 + distance(x_r,x_1)) ) + distance(x_r,x_2) ) / (cnt_1); + +// float wcss = ( ((wcss_1 + distance(x_r,x_1)) ) + distance(x_r,x_2) ); + +// float wcss = ( ( ( cnt_1*(wcss_1 + distance(x_r,x_1)) ) + distance(x_r,x_2) ) / (cnt_r) ); + +// float dissq= distance(x_1,x_2); +// float wcss = wcss_1 + dissq - (dissq/cnt_r) ; + + +// System.out.println("wcss = " + wcss); + float[][] ret = new float[3][]; ret[0] = new float[1]; ret[0][0] = cnt_r; @@ -262,17 +271,17 @@ public Multimap findDensityModes2() { { WCSS3 = WCSS3 + MapOfIDAndWCSS3.get(cur_id);} -// System.out.println("wcss1 = " + WCSS1); -// System.out.println("wcss2 = " + WCSS2); -// System.out.println("wcss3 = " + WCSS3); + System.out.println("wcss1 = " + WCSS1); + System.out.println("wcss2 = " + 
WCSS2); + System.out.println("wcss3 = " + WCSS3); - if ((WCSS1 >= WCSS2) && (WCSS1>=WCSS3)) + if ((WCSS1 <= WCSS2) && (WCSS1 <= WCSS3)) {MapOfIDAndCount = MapOfIDAndCount1; MapOfIDAndCent = MapOfIDAndCent1; MapOfIDAndWCSS = MapOfIDAndWCSS1; System.out.println("winner = tree1"); } - else if ((WCSS2 >= WCSS1) && (WCSS2>=WCSS3)) + else if ((WCSS2 <= WCSS1) && (WCSS2 <= WCSS3)) {MapOfIDAndCount = MapOfIDAndCount2; MapOfIDAndCent = MapOfIDAndCent2; MapOfIDAndWCSS = MapOfIDAndWCSS2; @@ -440,7 +449,7 @@ public static void main(String[] args) throws FileNotFoundException, int k = 10;//6; int d = 200;//16; - int n = 100000; + int n = 10000; float var = 1.5f; int count = 1; // System.out.printf("ClusterVar\t"); diff --git a/src/main/java/edu/uc/rphash/TWRPv6_COV.java b/src/main/java/edu/uc/rphash/TWRPv6_COV.java index f30e517..6127087 100644 --- a/src/main/java/edu/uc/rphash/TWRPv6_COV.java +++ b/src/main/java/edu/uc/rphash/TWRPv6_COV.java @@ -59,6 +59,21 @@ public List getCentroids() { return centroids; } + + +// This function returns the square of the euclidean distance. + public static float distancesq(float[] x, float[] y) { + if (x.length < 1) + return Float.MAX_VALUE; + if (y.length < 1) + return Float.MAX_VALUE; + float dist = (x[0] - y[0]) * (x[0] - y[0]); + for (int i = 1; i < x.length; i++) + dist += ((x[i] - y[i]) * (x[i] - y[i])); +// return (float) Math.sqrt(dist); + return dist; + } + /* @@ -78,7 +93,7 @@ float[] medoid(List X) { return ret; } -// this updates the map two cents with different weigths are merged into one. +// this updates the map two cents with different weights are merged into one. 
public static float[][] UpdateHashMap(float cnt_1, float[] x_1, float wcss_1, float cnt_2, float[] x_2 , float wcss_2) { @@ -86,44 +101,23 @@ public static float[][] UpdateHashMap(float cnt_1, float[] x_1, float wcss_1, float[] x_r = new float[x_1.length]; - float[] var_r1 = new float[x_1.length]; - float[] var_r2 = new float[x_1.length]; - - double var1=0; - double var2=0; - for (int i = 0; i < x_1.length; i++) { x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; - - var_r1[i] = ((-x_r[i] + x_1[i]) * (-x_r[i] + x_1[i]))/(x_r[i]*1000000000); - - var_r2[i] =(((-x_r[i] + x_2[i]) * (-x_r[i] + x_2[i])))/(x_r[i]*1000000000); - + } - - for (int i = 0; i < var_r1.length; i++) { - var1 = var1 + var_r1[i]; - var2 = var2 + var_r2[i]; - } - double wcsse=0; -// wcsse = ( cnt_1*(wcss_1*wcss_1 + (var1)) + var2 / (cnt_1 + cnt_2 ) ) ; - - wcsse = ( cnt_1*(wcss_1 + (var1)) + var2 / (cnt_1 + cnt_2 ) ) ; - - wcsse = wcsse/(cnt_r); - - // System.out.println("wcsse = " + wcsse); - - float wcss = (float) wcsse; + + float wcss_cov = ( ( ( cnt_1*(wcss_1 + distancesq(x_r,x_1)) ) + distancesq(x_r,x_2) ) / (cnt_r) ); +// wcss_cov = wcss_cov/; +// System.out.println("wcsse = " + wcss); float[][] ret = new float[3][]; ret[0] = new float[1]; ret[0][0] = cnt_r; ret[1] = x_r; ret[2]= new float [1]; - ret[2][0]= wcss; + ret[2][0]= wcss_cov; return ret; @@ -219,10 +213,6 @@ public Multimap findDensityModes2() { HashMap MapOfIDAndCount3 = new HashMap<>(); HashMap MapOfIDAndWCSS3 = new HashMap<>(); - HashMap MapOfIDAndCent = new HashMap<>(); - HashMap MapOfIDAndCount = new HashMap<>(); - HashMap MapOfIDAndWCSS = new HashMap<>(); - // #create projector matrixs @@ -247,100 +237,213 @@ public Multimap findDensityModes2() { } } - - float WCSS1 = 0; - float WCSS2 = 0; - float WCSS3 = 0; - - - for (Long cur_id : (MapOfIDAndWCSS1.keySet())) - { // System.out.println("wcss1 = " + MapOfIDAndWCSS1.get(cur_id)); - WCSS1 = WCSS1 + MapOfIDAndWCSS1.get(cur_id);} + 
System.out.println("NumberOfMicroClustersBeforePruning = "+ MapOfIDAndCent1.size()); - for (Long cur_id : (MapOfIDAndWCSS2.keySet())) - - { WCSS2 = WCSS2 + MapOfIDAndWCSS2.get(cur_id);} + // next we want to prune the tree by parent count comparison + // follows breadthfirst search - for (Long cur_id : (MapOfIDAndWCSS3.keySet())) - - { WCSS3 = WCSS3 + MapOfIDAndWCSS3.get(cur_id);} -// System.out.println("wcss1 = " + WCSS1); -// System.out.println("wcss2 = " + WCSS2); -// System.out.println("wcss3 = " + WCSS3); - if ((WCSS1 >= WCSS2) && (WCSS1>=WCSS3)) - {MapOfIDAndCount = MapOfIDAndCount1; - MapOfIDAndCent = MapOfIDAndCent1; - MapOfIDAndWCSS = MapOfIDAndWCSS1; - System.out.println("winner = tree1"); - } - else if ((WCSS2 >= WCSS1) && (WCSS2>=WCSS3)) - {MapOfIDAndCount = MapOfIDAndCount2; - MapOfIDAndCent = MapOfIDAndCent2; - MapOfIDAndWCSS = MapOfIDAndWCSS2; - System.out.println("winner = tree2"); + HashMap denseSetOfIDandCount2_1 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount1.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount1.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount1.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_1.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + MapOfIDAndCent1.put(parent_id, new float[]{}); + + // MapOfIDAndCount1.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_1.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent1.put(parent_id, new float[]{}); + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + } + } + } + } } - else - {MapOfIDAndCount = MapOfIDAndCount3; - MapOfIDAndCent = MapOfIDAndCent3; - MapOfIDAndWCSS = 
MapOfIDAndWCSS3; - System.out.println("winner = tree3"); - - } - - System.out.println("NumberOfMicroClustersBeforePruning = "+ MapOfIDAndCent.size()); - // next we want to prune the tree by parent count comparison - // follows breadthfirst search + HashMap denseSetOfIDandCount2_2 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount2.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount2.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount2.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_2.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + MapOfIDAndCent2.put(parent_id, new float[]{}); + + // MapOfIDAndCount2.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_2.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent2.put(parent_id, new float[]{}); + // MapOfIDAndCount2.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + } + } + } + } + } - HashMap denseSetOfIDandCount2 = new HashMap(); - for (Long cur_id : new TreeSet(MapOfIDAndCount.keySet())) + + + HashMap denseSetOfIDandCount2_3 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount3.keySet())) { if (cur_id >so.getk()){ - int cur_count = (int) (MapOfIDAndCount.get(cur_id).longValue()); + int cur_count = (int) (MapOfIDAndCount3.get(cur_id).longValue()); long parent_id = cur_id>>>1; - int parent_count = (int) (MapOfIDAndCount.get(parent_id).longValue()); + int parent_count = (int) (MapOfIDAndCount3.get(parent_id).longValue()); if(cur_count!=0 && parent_count!=0) { if(cur_count == parent_count) { - denseSetOfIDandCount2.put(parent_id, 0L); + denseSetOfIDandCount2_3.put(parent_id, 0L); // IDAndCent.put(parent_id, new 
ArrayList<>()); - MapOfIDAndCent.put(parent_id, new float[]{}); + MapOfIDAndCent3.put(parent_id, new float[]{}); // MapOfIDAndCount.put(parent_id, new Long (0)); - denseSetOfIDandCount2.put(cur_id, (long) cur_count); + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); } else { if(2 * cur_count > parent_count) { - denseSetOfIDandCount2.remove(parent_id); + denseSetOfIDandCount2_3.remove(parent_id); // IDAndCent.put(parent_id, new ArrayList<>()); - MapOfIDAndCent.put(parent_id, new float[]{}); + MapOfIDAndCent3.put(parent_id, new float[]{}); // MapOfIDAndCount.put(parent_id, new Long (0)); - denseSetOfIDandCount2.put(cur_id, (long) cur_count); + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); } } } } - } + } + + + + //remove keys with support less than 1 + Stream> stream2_1 = denseSetOfIDandCount2_1.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2_1= new ArrayList<>(); + // sort and limit the list + stream2_1.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_1.add(x.getKey())); + + + + Stream> stream2_2 = denseSetOfIDandCount2_2.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2_2= new ArrayList<>(); + // sort and limit the list + stream2_2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_2.add(x.getKey())); + + Stream> stream2_3 = denseSetOfIDandCount2_3.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2_3= new ArrayList<>(); + // sort and limit the list + stream2_3.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_3.add(x.getKey())); + + + float WCSS1 = 0; + float WCSS2 = 0; + float WCSS3 = 0; + HashMap denseSetOfIDandCount2 = new HashMap(); + HashMap MapOfIDAndCent = new HashMap<>(); + HashMap MapOfIDAndCount = new HashMap<>(); + HashMap MapOfIDAndWCSS = new HashMap<>(); + + + for (Long keys: sortedIDList2_1) +// for (Long cur_id : (((HashMap) stream2_1).keySet())) + + { // System.out.println("wcss1 = " + MapOfIDAndWCSS1.get(cur_id)); + WCSS1 = WCSS1 + MapOfIDAndWCSS1.get(keys);} + +// for (Long cur_id : (denseSetOfIDandCount2_2.keySet())) + for (Long keys: sortedIDList2_2) + { WCSS2 = WCSS2 + MapOfIDAndWCSS2.get(keys);} + +// for (Long cur_id : (denseSetOfIDandCount2_3.keySet())) + for (Long keys: sortedIDList2_3) + { WCSS3 = WCSS3 + MapOfIDAndWCSS3.get(keys);} + + System.out.println("wcss1 = " + WCSS1); + System.out.println("wcss2 = " + WCSS2); + System.out.println("wcss3 = " + WCSS3); + + if ((WCSS1 <= WCSS2) && (WCSS1 <= WCSS3)) + {MapOfIDAndCount = MapOfIDAndCount1; + MapOfIDAndCent = MapOfIDAndCent1; + MapOfIDAndWCSS = MapOfIDAndWCSS1; + denseSetOfIDandCount2 = denseSetOfIDandCount2_1; + System.out.println("winner = tree1"); + } + else if ((WCSS2<= WCSS1) && (WCSS2<=WCSS3)) + {MapOfIDAndCount = MapOfIDAndCount2; + MapOfIDAndCent = MapOfIDAndCent2; + MapOfIDAndWCSS = MapOfIDAndWCSS2; + denseSetOfIDandCount2 = denseSetOfIDandCount2_2; + System.out.println("winner = tree2"); + } + else + {MapOfIDAndCount = MapOfIDAndCount3; + MapOfIDAndCent = MapOfIDAndCent3; + MapOfIDAndWCSS = MapOfIDAndWCSS3; + denseSetOfIDandCount2 = denseSetOfIDandCount2_3; + System.out.println("winner = tree3"); + + } + System.out.println("NumberOfMicroClustersAfterPruning&beforesortingLimit = "+ denseSetOfIDandCount2.size()); //remove keys with support less than 1 Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); - List sortedIDList2= new ArrayList<>(); // sort and limit the 
list @@ -444,7 +547,7 @@ public static void main(String[] args) throws FileNotFoundException, int k = 10;//6; int d = 200;//16; - int n = 100000; + int n = 10000; float var = 1.5f; int count = 1; // System.out.printf("ClusterVar\t"); @@ -474,7 +577,7 @@ public static void main(String[] args) throws FileNotFoundException, // System.out.println("cutoff = "+ o.getCutoff()); // System.out.println("get_random_Vector = "+ o.getRandomVector()); - TWRPv6_COV rphit = new TWRPv6_COV(o); + TWRPv6_WCSS2 rphit = new TWRPv6_WCSS2(o); long startTime = System.nanoTime(); List centsr = rphit.getCentroids(); diff --git a/src/main/java/edu/uc/rphash/TWRPv6_WCSS2.java b/src/main/java/edu/uc/rphash/TWRPv6_WCSS2.java index be50273..0a5ab56 100644 --- a/src/main/java/edu/uc/rphash/TWRPv6_WCSS2.java +++ b/src/main/java/edu/uc/rphash/TWRPv6_WCSS2.java @@ -59,6 +59,21 @@ public List getCentroids() { return centroids; } + + +// This function returns the square of the euclidean distance. + public static float distance(float[] x, float[] y) { + if (x.length < 1) + return Float.MAX_VALUE; + if (y.length < 1) + return Float.MAX_VALUE; + float dist = (x[0] - y[0]) * (x[0] - y[0]); + for (int i = 1; i < x.length; i++) + dist += ((x[i] - y[i]) * (x[i] - y[i])); +// return (float) Math.sqrt(dist); + return dist; + } + /* @@ -78,41 +93,36 @@ float[] medoid(List X) { return ret; } -// this updates the map two cents with different weigths are merged into one. +// this updates the map two cents with different weights are merged into one. 
public static float[][] UpdateHashMap(float cnt_1, float[] x_1, float wcss_1, - float cnt_2, float[] x_2 , float wcss_2) { + float cnt_2, float[] x_2 , float wcss_2) { // incoming vector float cnt_r = cnt_1 + cnt_2; float[] x_r = new float[x_1.length]; - float[] var_r1 = new float[x_1.length]; - float[] var_r2 = new float[x_1.length]; - - double var1=0; - double var2=0; - - for (int i = 0; i < x_1.length; i++) { x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; - - var_r1[i] = ((-x_r[i] + x_1[i]) * (-x_r[i] + x_1[i]))/1000000000; - - var_r2[i] =(((-x_r[i] + x_2[i]) * (-x_r[i] + x_2[i])))/1000000000; - + } - - for (int i = 0; i < var_r1.length; i++) { - var1 = var1 + var_r1[i]; - var2 = var2 + var_r2[i]; - } - double wcsse=0; - wcsse = ( cnt_1*(wcss_1*wcss_1 + (var1)) + var2 / (cnt_1 + cnt_2 ) ) ; - - // System.out.println("wcsse = " + wcsse); + +// float wcss = ( ( cnt_1*(wcss_1 + distance(x_r,x_1)) ) + distance(x_r,x_2) ) / (cnt_1); + +// float wcss = ( ((wcss_1 + distance(x_r,x_1)) ) + distance(x_r,x_2) ); + +// float wcss = ( ( ( cnt_1*(wcss_1 + distance(x_r,x_1)) ) + distance(x_r,x_2) ) / (cnt_r) ); + + float dissq= distance(x_1,x_2); + float wcss = wcss_1 + dissq - (dissq/cnt_r) ; + + + + + + - float wcss = (float) wcsse; +// System.out.println("wcsse = " + wcss); float[][] ret = new float[3][]; ret[0] = new float[1]; @@ -239,8 +249,6 @@ public Multimap findDensityModes2() { } } - - System.out.println("NumberOfMicroClustersBeforePruning = "+ MapOfIDAndCent1.size()); @@ -365,6 +373,35 @@ public Multimap findDensityModes2() { } } + + + //remove keys with support less than 1 + Stream> stream2_1 = denseSetOfIDandCount2_1.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2_1= new ArrayList<>(); + // sort and limit the list + stream2_1.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_1.add(x.getKey())); + + + + Stream> stream2_2 = denseSetOfIDandCount2_2.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2_2= new ArrayList<>(); + // sort and limit the list + stream2_2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_2.add(x.getKey())); + + + + Stream> stream2_3 = denseSetOfIDandCount2_3.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2_3= new ArrayList<>(); + // sort and limit the list + stream2_3.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_3.add(x.getKey())); + + float WCSS1 = 0; float WCSS2 = 0; float WCSS3 = 0; @@ -373,31 +410,33 @@ public Multimap findDensityModes2() { HashMap MapOfIDAndCount = new HashMap<>(); HashMap MapOfIDAndWCSS = new HashMap<>(); - for (Long cur_id : (denseSetOfIDandCount2_1.keySet())) + + for (Long keys: sortedIDList2_1) +// for (Long cur_id : (((HashMap) stream2_1).keySet())) { // System.out.println("wcss1 = " + MapOfIDAndWCSS1.get(cur_id)); - WCSS1 = WCSS1 + MapOfIDAndWCSS1.get(cur_id);} + WCSS1 = WCSS1 + MapOfIDAndWCSS1.get(keys);} - for (Long cur_id : (denseSetOfIDandCount2_2.keySet())) - - { WCSS2 = WCSS2 + MapOfIDAndWCSS2.get(cur_id);} +// for (Long cur_id : (denseSetOfIDandCount2_2.keySet())) + for (Long keys: sortedIDList2_2) + { WCSS2 = WCSS2 + MapOfIDAndWCSS2.get(keys);} - for (Long cur_id : (denseSetOfIDandCount2_3.keySet())) - - { WCSS3 = WCSS3 + MapOfIDAndWCSS3.get(cur_id);} +// for (Long cur_id : (denseSetOfIDandCount2_3.keySet())) + for (Long keys: sortedIDList2_3) + { WCSS3 = WCSS3 + MapOfIDAndWCSS3.get(keys);} -// System.out.println("wcss1 = " + WCSS1); -// System.out.println("wcss2 = " + WCSS2); -// System.out.println("wcss3 = " + WCSS3); + System.out.println("wcss1 = " + WCSS1); + System.out.println("wcss2 = " + WCSS2); + System.out.println("wcss3 = " + WCSS3); - if ((WCSS1 >= 
WCSS2) && (WCSS1>=WCSS3)) + if ((WCSS1 <= WCSS2) && (WCSS1 <= WCSS3)) {MapOfIDAndCount = MapOfIDAndCount1; MapOfIDAndCent = MapOfIDAndCent1; MapOfIDAndWCSS = MapOfIDAndWCSS1; denseSetOfIDandCount2 = denseSetOfIDandCount2_1; System.out.println("winner = tree1"); } - else if ((WCSS2 >= WCSS1) && (WCSS2>=WCSS3)) + else if ((WCSS2<= WCSS1) && (WCSS2<=WCSS3)) {MapOfIDAndCount = MapOfIDAndCount2; MapOfIDAndCent = MapOfIDAndCent2; MapOfIDAndWCSS = MapOfIDAndWCSS2; @@ -417,7 +456,6 @@ else if ((WCSS2 >= WCSS1) && (WCSS2>=WCSS3)) //remove keys with support less than 1 Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); - List sortedIDList2= new ArrayList<>(); // sort and limit the list @@ -521,7 +559,7 @@ public static void main(String[] args) throws FileNotFoundException, int k = 10;//6; int d = 200;//16; - int n = 100000; + int n = 10000; float var = 1.5f; int count = 1; // System.out.printf("ClusterVar\t"); diff --git a/src/main/java/edu/uc/rphash/TWRPv6_meanVariance.java b/src/main/java/edu/uc/rphash/TWRPv6_meanVariance.java index a04f241..9196e44 100644 --- a/src/main/java/edu/uc/rphash/TWRPv6_meanVariance.java +++ b/src/main/java/edu/uc/rphash/TWRPv6_meanVariance.java @@ -20,6 +20,7 @@ import edu.uc.rphash.projections.Projector; import edu.uc.rphash.tests.StatTests; import edu.uc.rphash.tests.clusterers.Agglomerative3; +import edu.uc.rphash.tests.clusterers.Agglomerative3.ClusteringType; import edu.uc.rphash.tests.generators.GenerateData; import edu.uc.rphash.util.VectorUtil; @@ -59,6 +60,21 @@ public List getCentroids() { return centroids; } + + +// This function returns the square of the euclidean distance. 
+ public static float distancesq(float[] x, float[] y) { + if (x.length < 1) + return Float.MAX_VALUE; + if (y.length < 1) + return Float.MAX_VALUE; + float dist = (x[0] - y[0]) * (x[0] - y[0]); + for (int i = 1; i < x.length; i++) + dist += ((x[i] - y[i]) * (x[i] - y[i])); +// return (float) Math.sqrt(dist); + return dist; + } + /* @@ -78,7 +94,7 @@ float[] medoid(List X) { return ret; } -// this updates the map two cents with different weigths are merged into one. +// this updates the map two cents with different weights are merged into one. public static float[][] UpdateHashMap(float cnt_1, float[] x_1, float wcss_1, float cnt_2, float[] x_2 , float wcss_2) { @@ -86,37 +102,22 @@ public static float[][] UpdateHashMap(float cnt_1, float[] x_1, float wcss_1, float[] x_r = new float[x_1.length]; - float[] var_r1 = new float[x_1.length]; - float[] var_r2 = new float[x_1.length]; - - double var1=0; - double var2=0; - for (int i = 0; i < x_1.length; i++) { x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; - - var_r1[i] = ((-x_r[i] + x_1[i]) * (-x_r[i] + x_1[i]))/1000000000; - - var_r2[i] =(((-x_r[i] + x_2[i]) * (-x_r[i] + x_2[i])))/1000000000; - + } - - for (int i = 0; i < var_r1.length; i++) { - var1 = var1 + var_r1[i]; - var2 = var2 + var_r2[i]; - } - double wcsse=0; -// wcsse = ( cnt_1*(wcss_1*wcss_1 + (var1)) + var2 / (cnt_1 + cnt_2 ) ) ; - - wcsse = ( cnt_1*(wcss_1 + (var1)) + var2 / (cnt_1 + cnt_2 ) ) ; - - wcsse = wcsse/cnt_r; - - // System.out.println("wcsse = " + wcsse); +// float wcss = (distance(x_r,x_2)/cnt_r) + wcss_1; + +// float wcss = ( ( cnt_1*(wcss_1 + distance(x_r,x_1)) ) + distance(x_r,x_2) ) / (cnt_1); + +// float wcss = ( ((wcss_1 + distance(x_r,x_1)) ) + distance(x_r,x_2) ); + + float wcss = ( ( ( cnt_1*(wcss_1 + distancesq(x_r,x_1)) ) + distancesq(x_r,x_2) ) / (cnt_r) ); + - float wcss = (float) wcsse; +// System.out.println("wcsse = " + wcss); float[][] ret = new float[3][]; ret[0] = new float[1]; @@ -219,10 +220,6 @@ public Multimap 
findDensityModes2() { HashMap MapOfIDAndCount3 = new HashMap<>(); HashMap MapOfIDAndWCSS3 = new HashMap<>(); - HashMap MapOfIDAndCent = new HashMap<>(); - HashMap MapOfIDAndCount = new HashMap<>(); - HashMap MapOfIDAndWCSS = new HashMap<>(); - // #create projector matrixs @@ -247,100 +244,213 @@ public Multimap findDensityModes2() { } } - - float WCSS1 = 0; - float WCSS2 = 0; - float WCSS3 = 0; - - - for (Long cur_id : (MapOfIDAndWCSS1.keySet())) - { // System.out.println("wcss1 = " + MapOfIDAndWCSS1.get(cur_id)); - WCSS1 = WCSS1 + MapOfIDAndWCSS1.get(cur_id);} + System.out.println("NumberOfMicroClustersBeforePruning = "+ MapOfIDAndCent1.size()); - for (Long cur_id : (MapOfIDAndWCSS2.keySet())) - - { WCSS2 = WCSS2 + MapOfIDAndWCSS2.get(cur_id);} + // next we want to prune the tree by parent count comparison + // follows breadthfirst search - for (Long cur_id : (MapOfIDAndWCSS3.keySet())) - - { WCSS3 = WCSS3 + MapOfIDAndWCSS3.get(cur_id);} -// System.out.println("wcss1 = " + WCSS1); -// System.out.println("wcss2 = " + WCSS2); -// System.out.println("wcss3 = " + WCSS3); - if ((WCSS1 >= WCSS2) && (WCSS1>=WCSS3)) - {MapOfIDAndCount = MapOfIDAndCount1; - MapOfIDAndCent = MapOfIDAndCent1; - MapOfIDAndWCSS = MapOfIDAndWCSS1; - System.out.println("winner = tree1"); - } - else if ((WCSS2 >= WCSS1) && (WCSS2>=WCSS3)) - {MapOfIDAndCount = MapOfIDAndCount2; - MapOfIDAndCent = MapOfIDAndCent2; - MapOfIDAndWCSS = MapOfIDAndWCSS2; - System.out.println("winner = tree2"); + HashMap denseSetOfIDandCount2_1 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount1.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount1.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount1.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_1.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + 
MapOfIDAndCent1.put(parent_id, new float[]{}); + + // MapOfIDAndCount1.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_1.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent1.put(parent_id, new float[]{}); + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + } + } + } + } } - else - {MapOfIDAndCount = MapOfIDAndCount3; - MapOfIDAndCent = MapOfIDAndCent3; - MapOfIDAndWCSS = MapOfIDAndWCSS3; - System.out.println("winner = tree3"); - - } - - System.out.println("NumberOfMicroClustersBeforePruning = "+ MapOfIDAndCent.size()); - // next we want to prune the tree by parent count comparison - // follows breadthfirst search + HashMap denseSetOfIDandCount2_2 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount2.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount2.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount2.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_2.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + MapOfIDAndCent2.put(parent_id, new float[]{}); + + // MapOfIDAndCount2.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_2.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent2.put(parent_id, new float[]{}); + // MapOfIDAndCount2.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + } + } + } + } + } - HashMap denseSetOfIDandCount2 = new HashMap(); - for (Long cur_id : new TreeSet(MapOfIDAndCount.keySet())) + + + HashMap denseSetOfIDandCount2_3 = new HashMap(); + for (Long 
cur_id : new TreeSet(MapOfIDAndCount3.keySet())) { if (cur_id >so.getk()){ - int cur_count = (int) (MapOfIDAndCount.get(cur_id).longValue()); + int cur_count = (int) (MapOfIDAndCount3.get(cur_id).longValue()); long parent_id = cur_id>>>1; - int parent_count = (int) (MapOfIDAndCount.get(parent_id).longValue()); + int parent_count = (int) (MapOfIDAndCount3.get(parent_id).longValue()); if(cur_count!=0 && parent_count!=0) { if(cur_count == parent_count) { - denseSetOfIDandCount2.put(parent_id, 0L); + denseSetOfIDandCount2_3.put(parent_id, 0L); // IDAndCent.put(parent_id, new ArrayList<>()); - MapOfIDAndCent.put(parent_id, new float[]{}); + MapOfIDAndCent3.put(parent_id, new float[]{}); // MapOfIDAndCount.put(parent_id, new Long (0)); - denseSetOfIDandCount2.put(cur_id, (long) cur_count); + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); } else { if(2 * cur_count > parent_count) { - denseSetOfIDandCount2.remove(parent_id); + denseSetOfIDandCount2_3.remove(parent_id); // IDAndCent.put(parent_id, new ArrayList<>()); - MapOfIDAndCent.put(parent_id, new float[]{}); + MapOfIDAndCent3.put(parent_id, new float[]{}); // MapOfIDAndCount.put(parent_id, new Long (0)); - denseSetOfIDandCount2.put(cur_id, (long) cur_count); + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); } } } } - } + } + + + + //remove keys with support less than 1 + Stream> stream2_1 = denseSetOfIDandCount2_1.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2_1= new ArrayList<>(); + // sort and limit the list + stream2_1.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_1.add(x.getKey())); + + + + Stream> stream2_2 = denseSetOfIDandCount2_2.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2_2= new ArrayList<>(); + // sort and limit the list + stream2_2.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_2.add(x.getKey())); + + + + Stream> stream2_3 = denseSetOfIDandCount2_3.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2_3= new ArrayList<>(); + // sort and limit the list + stream2_3.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_3.add(x.getKey())); + + + float WCSS1 = 0; + float WCSS2 = 0; + float WCSS3 = 0; + HashMap denseSetOfIDandCount2 = new HashMap(); + HashMap MapOfIDAndCent = new HashMap<>(); + HashMap MapOfIDAndCount = new HashMap<>(); + HashMap MapOfIDAndWCSS = new HashMap<>(); + + + for (Long keys: sortedIDList2_1) +// for (Long cur_id : (((HashMap) stream2_1).keySet())) + + { // System.out.println("wcss1 = " + MapOfIDAndWCSS1.get(cur_id)); + WCSS1 = WCSS1 + MapOfIDAndWCSS1.get(keys);} + +// for (Long cur_id : (denseSetOfIDandCount2_2.keySet())) + for (Long keys: sortedIDList2_2) + { WCSS2 = WCSS2 + MapOfIDAndWCSS2.get(keys);} +// for (Long cur_id : (denseSetOfIDandCount2_3.keySet())) + for (Long keys: sortedIDList2_3) + { WCSS3 = WCSS3 + MapOfIDAndWCSS3.get(keys);} + + System.out.println("wcss1 = " + WCSS1); + System.out.println("wcss2 = " + WCSS2); + System.out.println("wcss3 = " + WCSS3); + + if ((WCSS1 <= WCSS2) && (WCSS1 <= WCSS3)) + {MapOfIDAndCount = MapOfIDAndCount1; + MapOfIDAndCent = MapOfIDAndCent1; + MapOfIDAndWCSS = MapOfIDAndWCSS1; + denseSetOfIDandCount2 = denseSetOfIDandCount2_1; + System.out.println("winner = tree1"); + } + else if ((WCSS2<= WCSS1) && (WCSS2<=WCSS3)) + {MapOfIDAndCount = MapOfIDAndCount2; + MapOfIDAndCent = MapOfIDAndCent2; + MapOfIDAndWCSS = MapOfIDAndWCSS2; + denseSetOfIDandCount2 = denseSetOfIDandCount2_2; + System.out.println("winner = tree2"); + } + else + {MapOfIDAndCount = MapOfIDAndCount3; + MapOfIDAndCent = MapOfIDAndCent3; + MapOfIDAndWCSS = MapOfIDAndWCSS3; + denseSetOfIDandCount2 = denseSetOfIDandCount2_3; + System.out.println("winner = tree3"); + + } 
System.out.println("NumberOfMicroClustersAfterPruning&beforesortingLimit = "+ denseSetOfIDandCount2.size()); //remove keys with support less than 1 Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); - List sortedIDList2= new ArrayList<>(); // sort and limit the list @@ -430,7 +540,7 @@ public void run() { } - Agglomerative3 aggloOffline = new Agglomerative3(centroids2, so.getk()); + Agglomerative3 aggloOffline = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids2, so.getk()); aggloOffline.setWeights(weights2); @@ -444,7 +554,7 @@ public static void main(String[] args) throws FileNotFoundException, int k = 10;//6; int d = 200;//16; - int n = 100000; + int n = 10000; float var = 1.5f; int count = 1; // System.out.printf("ClusterVar\t"); @@ -458,17 +568,22 @@ public static void main(String[] args) throws FileNotFoundException, float avgrealwcss = 0; float avgtime = 0; // System.out.printf("%f\t", f); - GenerateData gen = new GenerateData(k, n/k, d, f, true, .5f); + // GenerateData gen = new GenerateData(k, n/k, d, f, true, .5f); - // gen.writeCSVToFile(new File("/home/lee/Desktop/reclsh/in.csv")); + // gen.writeCSVToFile(new File("/home/lee/Desktop/reclsh/in.csv")); + // List data = "/C:/Users/user/Desktop/temp/OutputTwrpCents1" + + // RPHashObject o = new SimpleArrayReader(gen.data, k); + + boolean raw = Boolean.parseBoolean(("raw")); + List data = null; + data = VectorUtil.readFile("/C:/Users/deysn/Desktop/temp/1D.txt", raw); + k = 6; + RPHashObject o = new SimpleArrayReader(data, 6); + - // List data = "/C:/Users/user/Desktop/temp/OutputTwrpCents1" - - RPHashObject o = new SimpleArrayReader(gen.data, k); - o.setDimparameter(16); - - o.setCutoff(100); + o.setCutoff(60); o.setRandomVector(true); // System.out.println("cutoff = "+ o.getCutoff()); @@ -480,12 +595,16 @@ public static void main(String[] args) throws FileNotFoundException, avgtime += (System.nanoTime() - startTime) / 100000000; - avgrealwcss += 
StatTests.WCSSEFloatCentroid(gen.getMedoids(), - gen.getData()); +// avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(),gen.getData()); + + + VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); - System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); +// System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, data)); + System.gc(); System.out.printf("%.0f\n", avgrealwcss / count); diff --git a/src/main/java/edu/uc/rphash/tests/StatTests.java b/src/main/java/edu/uc/rphash/tests/StatTests.java index 6b7a927..6d9aecd 100644 --- a/src/main/java/edu/uc/rphash/tests/StatTests.java +++ b/src/main/java/edu/uc/rphash/tests/StatTests.java @@ -110,7 +110,7 @@ public static double WCSSECentroidsFloat(List estCentroids, List Date: Thu, 12 Mar 2020 09:27:22 -0400 Subject: [PATCH 10/29] testing for automatic cluster detection and choosing the best tree version --- src/main/java/edu/uc/rphash/TWRPv6_COV.java | 1 + .../edu/uc/rphash/TWRPv6_wcss_offline.java | 775 ++++++++ .../uc/rphash/TWRPv6_wcss_offline2_TEST.java | 910 ++++++++++ .../TWRPv6_wcss_offline2_TEST2_10runs.java | 1596 +++++++++++++++++ .../TWRPv6_wcss_offline2_TEST2_5runs.java | 1062 +++++++++++ .../uc/rphash/tests/clusterers/DBScan.java | 13 +- 6 files changed, 4354 insertions(+), 3 deletions(-) create mode 100644 src/main/java/edu/uc/rphash/TWRPv6_wcss_offline.java create mode 100644 src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST.java create mode 100644 src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_10runs.java create mode 100644 src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_5runs.java diff --git a/src/main/java/edu/uc/rphash/TWRPv6_COV.java b/src/main/java/edu/uc/rphash/TWRPv6_COV.java index 6127087..ecfce02 100644 --- a/src/main/java/edu/uc/rphash/TWRPv6_COV.java +++ b/src/main/java/edu/uc/rphash/TWRPv6_COV.java @@ -118,6 +118,7 @@ public static float[][] 
UpdateHashMap(float cnt_1, float[] x_1, float wcss_1, ret[1] = x_r; ret[2]= new float [1]; ret[2][0]= wcss_cov; +// ret[3][0]= distance; return ret; diff --git a/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline.java b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline.java new file mode 100644 index 0000000..e50785f --- /dev/null +++ b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline.java @@ -0,0 +1,775 @@ +package edu.uc.rphash; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +//import java.util.Arrays; +import java.util.HashMap; +//import java.util.Iterator; +//import java.util.LinkedHashMap; +import java.util.List; +//import java.util.Map; +import java.util.Map.Entry; +import java.util.Random; +import java.util.TreeSet; +import java.util.stream.Stream; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.clusterers.Agglomerative3; +import edu.uc.rphash.tests.clusterers.KMeans2; +import edu.uc.rphash.tests.clusterers.Agglomerative3.ClusteringType; +import edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.VectorUtil; + +//import org.apache.commons.collections.map.MultiValueMap; +//import org.apache.commons.collections.map.*; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Multimap; + + + +// this algorithm runs twrp 3 times : (only the random bisection vector varies, the Projection matrix remains same) +// and selects the one which hass the best wcss offline for the 10X candidate centroids. 
+public class TWRPv6_wcss_offline implements Clusterer, Runnable { + + boolean znorm = false; + + private int counter; + private float[] rngvec; + private float[] rngvec2; + private float[] rngvec3; + + private List centroids = null; + + private RPHashObject so; + + public TWRPv6_wcss_offline(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + return getCentroids(); + } + + @Override + public List getCentroids() { + if (centroids == null) + run(); + return centroids; + } + + +// This function returns the square of the euclidean distance. + public static float distancesq(float[] x, float[] y) { + if (x.length < 1) + return Float.MAX_VALUE; + if (y.length < 1) + return Float.MAX_VALUE; + float dist = (x[0] - y[0]) * (x[0] - y[0]); + for (int i = 1; i < x.length; i++) + dist += ((x[i] - y[i]) * (x[i] - y[i])); +// return (float) Math.sqrt(dist); + return dist; + } + + /* + + /* + * X - set of vectors compute the medoid of a vector set + */ + float[] medoid(List X) { + float[] ret = X.get(0); + for (int i = 1; i < X.size(); i++) { + for (int j = 0; j < ret.length; j++) { + ret[j] += X.get(i)[j]; + } + } + for (int j = 0; j < ret.length; j++) { + ret[j] = ret[j] / ((float) X.size()); + } + return ret; + } + +// this updates the map two cents with different weights are merged into one. 
+ public static float[][] UpdateHashMap(float cnt_1, float[] x_1, float wcss_1, + float cnt_2, float[] x_2 , float wcss_2) { + + float cnt_r = cnt_1 + cnt_2; + + float[] x_r = new float[x_1.length]; + + for (int i = 0; i < x_1.length; i++) { + x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; + + } +// float wcss = (distancesq(x_r,x_2)/cnt_r) + wcss_1; + +// float wcss = ( ( cnt_1*(wcss_1 + distancesq(x_r,x_1)) ) + distancesq(x_r,x_2) ) / (cnt_1); + + float wcss = ( ((wcss_1 + distancesq(x_r,x_1)) ) + distancesq(x_r,x_2) ); + +// float wcss = ( ( ( cnt_1*(wcss_1 + distancesq(x_r,x_1)) ) + distancesq(x_r,x_2) ) / (cnt_r) ); + +// System.out.println("wcsse = " + wcss); + + float[][] ret = new float[3][]; + ret[0] = new float[1]; + ret[0][0] = cnt_r; + ret[1] = x_r; + ret[2]= new float [1]; + ret[2][0]= wcss; + return ret; + + } + +// this method is used to calculate the offline wcss +// UpdateHashMap_offlineWcss( CurrentCent, currentWcss, IncomingVector, incomingWcss ); + + public static float[][] UpdateHashMap_offlineWcss(float[] x_1, float wcss_1,float[] x_2 ) { + + float wcss = wcss_1 + distancesq(x_1,x_2); + +// System.out.println("wcsse = " + wcss); + + float[][] ret = new float[3][]; + ret[0] = new float[1]; +// ret[0][0] = cnt_r; +// ret[1] = x_r; + ret[2]= new float [1]; + ret[2][0]= wcss; + return ret; + + } + + public long hashvec2( float[] xt, float[] x, + HashMap MapOfIDAndCent, HashMap MapOfIDAndCount, int ct, float[] rngvec, HashMap MapOfIDAndWCSS) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { +// s <<= 1; + s = s << 1 ; // left shift the bits of s by 1. 
+ if (xt[i] > rngvec[i]) + s= s+1; + + if (MapOfIDAndCent.containsKey(s)) { + + float CurrentCount = MapOfIDAndCount.get(s); + float CurrentCent [] = MapOfIDAndCent.get(s); + float CountForIncomingVector = 1; + float IncomingVector [] = x; + float currentWcss= MapOfIDAndWCSS.get(s); + float incomingWcss= 0; + + float[][] MergedValues = UpdateHashMap(CurrentCount , CurrentCent, currentWcss, CountForIncomingVector, IncomingVector, incomingWcss ); + + Long UpdatedCount = (long) MergedValues[0][0] ; + + float[] MergedVector = MergedValues[1] ; + + float wcss= MergedValues[2][0]; + + MapOfIDAndCount.put(s , UpdatedCount); + + MapOfIDAndCent.put(s, MergedVector); + + MapOfIDAndWCSS.put(s, wcss); + + } + + else { + + float[] xlist = x; + MapOfIDAndCent.put(s, xlist); + MapOfIDAndCount.put(s, (long)1); + MapOfIDAndWCSS.put(s, (float)0); + } + } + return s; + } + +// this hash is to calculate the wcss +// hashvec2_forwcss(xt,x,IDAndCent,rngvec ,IDandWCSS); + + public long hashvec2_forwcss( float[] xt, float[] x, HashMap MapOfIDAndCent, float[] rngvec, HashMapIDandWCSS_offline) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { + s = s << 1 ; // left shift the bits of s by 1. 
+ if (xt[i] > rngvec[i]) + s= s+1; + + if (MapOfIDAndCent.containsKey(s)) { + + float CurrentCent [] = MapOfIDAndCent.get(s); + float IncomingVector [] = x; + + + float currentWcss= 0; + + if (IDandWCSS_offline.containsKey(s)) { + currentWcss= IDandWCSS_offline.get(s); + } + + float[][] MergedValues = UpdateHashMap_offlineWcss( CurrentCent, currentWcss, IncomingVector ); + + float wcss= MergedValues[2][0]; + + + IDandWCSS_offline.put(s, wcss); + + + } + } + return s; + } + + /* + * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid + * vector map + * + * hash the projected vector x and update the hash to centroid and counts + * maps + */ + void addtocounter(float[] x, Projector p, + HashMap IDAndCent,HashMap IDandID,int ct, float[] rngvec , HashMap IDandWCSS) { + float[] xt = p.project(x); + + hashvec2(xt,x,IDAndCent, IDandID, ct,rngvec , IDandWCSS); + } + + // this method is used to compute the offline WCSS to choose the best of the clusters + //calcWCSSoffline(x, projector, MapOfIDAndCent1, rngvec, MapOfIDAandWCSS1_offline); + + void calcWCSSoffline(float[] x, Projector p, HashMap MapOfIDAndCent, float[] rngvec , HashMap MapOfIDAandWCSS_offline) { + + float[] xt = p.project(x); + + hashvec2_forwcss(xt,x,MapOfIDAndCent,rngvec ,MapOfIDAandWCSS_offline); + + } + + static boolean isPowerOfTwo(long num) { + return (num & -num) == num; + } + + /* + * X - data set k - canonical k in k-means l - clustering sub-space Compute + * density mode via iterative deepening hash counting + */ + + public Multimap findDensityModes2() { + //public Map findDensityModes2() { + HashMap MapOfIDAndCent1 = new HashMap<>(); + HashMap MapOfIDAndCount1 = new HashMap<>(); + HashMap MapOfIDAndWCSS1 = new HashMap<>(); + + HashMap MapOfIDAndCent2 = new HashMap<>(); + HashMap MapOfIDAndCount2 = new HashMap<>(); + HashMap MapOfIDAndWCSS2 = new HashMap<>(); + + HashMap MapOfIDAndCent3 = new HashMap<>(); + HashMap MapOfIDAndCount3 = new HashMap<>(); + HashMap MapOfIDAndWCSS3 = new 
HashMap<>(); + + + // #create projector matrixs + Projector projector = so.getProjectionType(); + projector.setOrigDim(so.getdim()); + projector.setProjectedDim(so.getDimparameter()); + projector.setRandomSeed(so.getRandomSeed()); +// projector.setRandomSeed(535247432); + + projector.init(); + int cutoff = so.getCutoff(); + + int ct = 0; + + { + + for (float[] x : so.getRawData()) + { + addtocounter(x, projector, MapOfIDAndCent1, MapOfIDAndCount1,ct++, rngvec, MapOfIDAndWCSS1); + addtocounter(x, projector, MapOfIDAndCent2, MapOfIDAndCount2,ct++, rngvec2,MapOfIDAndWCSS2); + addtocounter(x, projector, MapOfIDAndCent3, MapOfIDAndCount3,ct++, rngvec3,MapOfIDAndWCSS3); + + } + } + + System.out.println("NumberOfMicroClustersBeforePruning = "+ MapOfIDAndCent1.size()); + + // next we want to prune the tree by parent count comparison + // follows breadthfirst search + + HashMap denseSetOfIDandCount2_1 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount1.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount1.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount1.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_1.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + MapOfIDAndCent1.put(parent_id, new float[]{}); + + // MapOfIDAndCount1.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_1.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent1.put(parent_id, new float[]{}); + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_2 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount2.keySet())) + { + if (cur_id 
>so.getk()){ + int cur_count = (int) (MapOfIDAndCount2.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount2.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_2.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + MapOfIDAndCent2.put(parent_id, new float[]{}); + + // MapOfIDAndCount2.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_2.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent2.put(parent_id, new float[]{}); + // MapOfIDAndCount2.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_3 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount3.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount3.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount3.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_3.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + MapOfIDAndCent3.put(parent_id, new float[]{}); + + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_3.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent3.put(parent_id, new float[]{}); + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + } + } + } + } + } + + + //remove keys with support less than 1 + Stream> stream2_1 = denseSetOfIDandCount2_1.entrySet().stream().filter(p -> p.getValue() > 1); + + List 
sortedIDList2_1= new ArrayList<>(); + // sort and limit the list + stream2_1.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_1.add(x.getKey())); + + + + Stream> stream2_2 = denseSetOfIDandCount2_2.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2_2= new ArrayList<>(); + // sort and limit the list + stream2_2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_2.add(x.getKey())); + + + + Stream> stream2_3 = denseSetOfIDandCount2_3.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2_3= new ArrayList<>(); + // sort and limit the list + stream2_3.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_3.add(x.getKey())); + + + float WCSS1 = 0; + float WCSS2 = 0; + float WCSS3 = 0; + + float WCSS_off_1 = 0; + float WCSS_off_2 = 0; + float WCSS_off_3 = 0; + + + HashMap denseSetOfIDandCount2 = new HashMap(); + + + HashMap MapOfIDAndCent = new HashMap<>(); + HashMap MapOfIDAndCount = new HashMap<>(); + HashMap MapOfIDAndWCSS = new HashMap<>(); + + HashMap MapOfIDAandWCSS_offline_1 = new HashMap<>(); + HashMap MapOfIDAandWCSS_offline_2 = new HashMap<>(); + HashMap MapOfIDAandWCSS_offline_3 = new HashMap<>(); + + // calculate the real wcss in offline fashion, so for the keys , hash the points into those buckets + // and calculate the wcss as we know their centroids : + + + for (float[] x : so.getRawData()) + { + + calcWCSSoffline(x, projector, MapOfIDAndCent1, rngvec, MapOfIDAandWCSS_offline_1); + calcWCSSoffline(x, projector, MapOfIDAndCent2, rngvec2, MapOfIDAandWCSS_offline_2); + calcWCSSoffline(x, projector, MapOfIDAndCent3, rngvec3, MapOfIDAandWCSS_offline_3); + + } + + + //* THIS IS THE RUNTIME CALCULATION OF WCSS STATISTICS WHICH IS DONE ONLINE: + + for (Long keys: sortedIDList2_1) +// for (Long cur_id : (((HashMap) stream2_1).keySet())) + { // System.out.println("wcss1 = " + 
MapOfIDAndWCSS1.get(cur_id)); + WCSS1 = WCSS1 + MapOfIDAndWCSS1.get(keys);} + +// for (Long cur_id : (denseSetOfIDandCount2_2.keySet())) + for (Long keys: sortedIDList2_2) + { WCSS2 = WCSS2 + MapOfIDAndWCSS2.get(keys);} + +// for (Long cur_id : (denseSetOfIDandCount2_3.keySet())) + for (Long keys: sortedIDList2_3) + { WCSS3 = WCSS3 + MapOfIDAndWCSS3.get(keys);} + +//* THIS IS THE RUNTIME CALCULATION OF WCSS STATISTICS WHICH REQUIRES ANOTHER PASS OVER THE DATA: + for (Long keys: sortedIDList2_1) +// for (Long cur_id : (((HashMap) stream2_1).keySet())) + { // System.out.println("wcss1 = " + MapOfIDAndWCSS1.get(cur_id)); + WCSS_off_1 = WCSS_off_1 + MapOfIDAandWCSS_offline_1.get(keys);} + +// for (Long cur_id : (denseSetOfIDandCount2_2.keySet())) + for (Long keys: sortedIDList2_2) + { WCSS_off_2 = WCSS_off_2 + MapOfIDAandWCSS_offline_2.get(keys);} + +// for (Long cur_id : (denseSetOfIDandCount2_3.keySet())) + for (Long keys: sortedIDList2_3) + { WCSS_off_3 = WCSS_off_3 + MapOfIDAandWCSS_offline_3.get(keys);} + + + + System.out.print("wcss1 = " + WCSS1); + System.out.println(" wcss_ofline_1 = " + WCSS_off_1); + + System.out.print("wcss2 = " + WCSS2); + System.out.println(" wcss_ofline_2 = " + WCSS_off_2); + + System.out.print("wcss3 = " + WCSS3); + System.out.println(" wcss_ofline_3 = " + WCSS_off_3); + + + if ((WCSS_off_1 <= WCSS_off_2) && (WCSS_off_1 <= WCSS_off_3)) + {MapOfIDAndCount = MapOfIDAndCount1; + MapOfIDAndCent = MapOfIDAndCent1; + MapOfIDAndWCSS = MapOfIDAndWCSS1; + denseSetOfIDandCount2 = denseSetOfIDandCount2_1; + System.out.println("winner = tree1"); + } + else if ((WCSS_off_2<= WCSS_off_1) && (WCSS_off_2<=WCSS_off_3)) + {MapOfIDAndCount = MapOfIDAndCount2; + MapOfIDAndCent = MapOfIDAndCent2; + MapOfIDAndWCSS = MapOfIDAndWCSS2; + denseSetOfIDandCount2 = denseSetOfIDandCount2_2; + System.out.println("winner = tree2"); + } + else + {MapOfIDAndCount = MapOfIDAndCount3; + MapOfIDAndCent = MapOfIDAndCent3; + MapOfIDAndWCSS = MapOfIDAndWCSS3; + 
denseSetOfIDandCount2 = denseSetOfIDandCount2_3; + System.out.println("winner = tree3"); + + } + + System.out.println("NumberOfMicroClustersAfterPruning&beforesortingLimit = "+ denseSetOfIDandCount2.size()); + + //remove keys with support less than 1 + Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2= new ArrayList<>(); + // sort and limit the list + stream2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2.add(x.getKey())); + + + Multimap multimapWeightAndCent = ArrayListMultimap.create(); + + + for (Long keys: sortedIDList2) + + { + + + multimapWeightAndCent.put((Long)(MapOfIDAndCount.get(keys)), (float[]) (MapOfIDAndCent.get(keys))); + + + } + + + return multimapWeightAndCent; + + + + + // this is to be taken out . only done for hypothesis testing. + + + + + +} + + + public void run() { + rngvec = new float[so.getDimparameter()]; + + rngvec2 = new float[so.getDimparameter()]; + + rngvec3 = new float[so.getDimparameter()]; + + counter = 0; + boolean randVect = so.getRandomVector(); + + // Random r = new Random(so.getRandomSeed()); + // Random r = new Random(3800635955020675334L) ; + Random r = new Random(); + Random r2 = new Random(); + Random r3 = new Random(); + + if (randVect==true){ + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) r.nextGaussian(); + + for (int i = 0; i < so.getDimparameter(); i++) + rngvec2[i] = (float) r2.nextGaussian(); + + for (int i = 0; i < so.getDimparameter(); i++) + rngvec3[i] = (float) r3.nextGaussian(); + + + } else { + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) 0; + } + + + Multimap WeightAndClusters = findDensityModes2(); + + + Listcentroids2 = new ArrayList<>(); + List weights2 =new ArrayList<>(); + + + System.out.println("NumberOfMicroClusters_AfterPruning = "+ WeightAndClusters.size()); + System.out.println("getRandomVector = "+ randVect); + + + for (Long weights : 
WeightAndClusters.keys()) + { + + weights2.add((float)weights); + + } + + + for (Long weight : WeightAndClusters.keySet()) + + { + + centroids2.addAll(WeightAndClusters.get(weight)); + + } + + + Agglomerative3 aggloOffline = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids2, so.getk()); + aggloOffline.setWeights(weights2); + this.centroids = aggloOffline.getCentroids(); + + +/* + KMeans2 aggloOffline2 = new KMeans2(); + aggloOffline2.setK(so.getk()); + aggloOffline2.setRawData(centroids2); + aggloOffline2.setWeights(weights2); + this.centroids = aggloOffline2.getCentroids(); +*/ + + } + + + public static void main(String[] args) throws FileNotFoundException, + IOException { + + int k = 10;//6; + int d = 200;//16; + int n = 10000; + float var = 1.5f; + int count = 1; + // System.out.printf("ClusterVar\t"); + // for (int i = 0; i < count; i++) + // System.out.printf("Trial%d\t", i); + // System.out.printf("RealWCSS\n"); + + String Output = "/C:/Users/deysn/Desktop/temp/OutputTwrpCents1" ; + + float f = var; + float avgrealwcss = 0; + float avgtime = 0; + // System.out.printf("%f\t", f); + // GenerateData gen = new GenerateData(k, n/k, d, f, true, .5f); + + // gen.writeCSVToFile(new File("/home/lee/Desktop/reclsh/in.csv")); + // List data = "/C:/Users/user/Desktop/temp/OutputTwrpCents1" + + // RPHashObject o = new SimpleArrayReader(gen.data, k); + + boolean raw = Boolean.parseBoolean(("raw")); + List data = null; + data = VectorUtil.readFile("/C:/Users/deysn/Desktop/temp/1D.txt", raw); + k = 6; + RPHashObject o = new SimpleArrayReader(data, 6); + + + o.setDimparameter(16); + o.setCutoff(60); + o.setRandomVector(true); + +// System.out.println("cutoff = "+ o.getCutoff()); +// System.out.println("get_random_Vector = "+ o.getRandomVector()); + + TWRPv6_wcss_offline rphit = new TWRPv6_wcss_offline(o); + long startTime = System.nanoTime(); + List centsr = rphit.getCentroids(); + + avgtime += (System.nanoTime() - startTime) / 100000000; + +// avgrealwcss += 
StatTests.WCSSEFloatCentroid(gen.getMedoids(),gen.getData()); + + + VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); + +// System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, data)); + + System.gc(); + + System.out.printf("%.0f\n", avgrealwcss / count); + + + } + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + this.so.setK(getk); + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + return false; + } + + //@Override + public void setCutoff(int getCutoff) { + this.so.setCutoff(getCutoff); + } + + //@Override + public void setRandomVector(boolean getRandomVector) { + this.so.setRandomVector(getRandomVector); + } + + +} diff --git a/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST.java b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST.java new file mode 100644 index 0000000..a728a44 --- /dev/null +++ b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST.java @@ -0,0 +1,910 @@ +package edu.uc.rphash; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +//import java.util.Arrays; +import java.util.HashMap; +//import java.util.Iterator; +//import java.util.LinkedHashMap; +import java.util.List; +//import java.util.Map; +import java.util.Map.Entry; +import java.util.Random; +import java.util.TreeSet; 
+import java.util.stream.Stream; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.clusterers.Agglomerative3; +import edu.uc.rphash.tests.clusterers.KMeans2; +import edu.uc.rphash.tests.clusterers.Agglomerative3.ClusteringType; +import edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.VectorUtil; +import edu.uc.rphash.tests.clusterers.DBScan; + +//import org.apache.commons.collections.map.MultiValueMap; +//import org.apache.commons.collections.map.*; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Multimap; + + + +// this algorithm runs twrp 3 times : (only the random bisection vector varies, the Projection matrix remains same) +// and selects the one which has the best wcss offline for the 10X candidate centroids. +public class TWRPv6_wcss_offline2_TEST implements Clusterer, Runnable { + + boolean znorm = false; + + private int counter; + private float[] rngvec; + private float[] rngvec2; + private float[] rngvec3; + + private List centroids = null; + + private RPHashObject so; + + public TWRPv6_wcss_offline2_TEST(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + return getCentroids(); + } + + @Override + public List getCentroids() { + if (centroids == null) + run(); + return centroids; + } + + +// This function returns the square of the euclidean distance. 
+ public static float distancesq(float[] x, float[] y) { + if (x.length < 1) + return Float.MAX_VALUE; + if (y.length < 1) + return Float.MAX_VALUE; + float dist = (x[0] - y[0]) * (x[0] - y[0]); + for (int i = 1; i < x.length; i++) + dist += ((x[i] - y[i]) * (x[i] - y[i])); +// return (float) Math.sqrt(dist); + return dist; + } + + /* + + /* + * X - set of vectors compute the medoid of a vector set + */ + float[] medoid(List X) { + float[] ret = X.get(0); + for (int i = 1; i < X.size(); i++) { + for (int j = 0; j < ret.length; j++) { + ret[j] += X.get(i)[j]; + } + } + for (int j = 0; j < ret.length; j++) { + ret[j] = ret[j] / ((float) X.size()); + } + return ret; + } + +// this updates the map two cents with different weights are merged into one. + public static float[][] UpdateHashMap(float cnt_1, float[] x_1, float wcss_1, + float cnt_2, float[] x_2 , float wcss_2) { + + float cnt_r = cnt_1 + cnt_2; + + float[] x_r = new float[x_1.length]; + + for (int i = 0; i < x_1.length; i++) { + x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; + + } +// float wcss = (distancesq(x_r,x_2)/cnt_r) + wcss_1; + +// float wcss = ( ( cnt_1*(wcss_1 + distancesq(x_r,x_1)) ) + distancesq(x_r,x_2) ) / (cnt_1); + + float wcss = ( ((wcss_1 + distancesq(x_r,x_1)) ) + distancesq(x_r,x_2) ); + +// float wcss = ( ( ( cnt_1*(wcss_1 + distancesq(x_r,x_1)) ) + distancesq(x_r,x_2) ) / (cnt_r) ); + +// System.out.println("wcsse = " + wcss); + + float[][] ret = new float[3][]; + ret[0] = new float[1]; + ret[0][0] = cnt_r; + ret[1] = x_r; + ret[2]= new float [1]; + ret[2][0]= wcss; + return ret; + + } + +// this method is used to calculate the offline wcss +// UpdateHashMap_offlineWcss( CurrentCent, currentWcss, IncomingVector, incomingWcss ); + + public static float[][] UpdateHashMap_offlineWcss(float[] x_1, float wcss_1,float[] x_2 ) { + + float wcss = wcss_1 + distancesq(x_1,x_2); + +// System.out.println("wcsse = " + wcss); + + float[][] ret = new float[3][]; + ret[0] = new float[1]; +// 
ret[0][0] = cnt_r; +// ret[1] = x_r; + ret[2]= new float [1]; + ret[2][0]= wcss; + return ret; + + } + + public long hashvec2( float[] xt, float[] x, + HashMap MapOfIDAndCent, HashMap MapOfIDAndCount, int ct, float[] rngvec, HashMap MapOfIDAndWCSS) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { +// s <<= 1; + s = s << 1 ; // left shift the bits of s by 1. + if (xt[i] > rngvec[i]) + s= s+1; + + if (MapOfIDAndCent.containsKey(s)) { + + float CurrentCount = MapOfIDAndCount.get(s); + float CurrentCent [] = MapOfIDAndCent.get(s); + float CountForIncomingVector = 1; + float IncomingVector [] = x; + float currentWcss= MapOfIDAndWCSS.get(s); + float incomingWcss= 0; + + float[][] MergedValues = UpdateHashMap(CurrentCount , CurrentCent, currentWcss, CountForIncomingVector, IncomingVector, incomingWcss ); + + Long UpdatedCount = (long) MergedValues[0][0] ; + + float[] MergedVector = MergedValues[1] ; + + float wcss= MergedValues[2][0]; + + MapOfIDAndCount.put(s , UpdatedCount); + + MapOfIDAndCent.put(s, MergedVector); + + MapOfIDAndWCSS.put(s, wcss); + + } + + else { + + float[] xlist = x; + MapOfIDAndCent.put(s, xlist); + MapOfIDAndCount.put(s, (long)1); + MapOfIDAndWCSS.put(s, (float)0); + } + } + return s; + } + +// this hash is to calculate the wcss +// hashvec2_forwcss(xt,x,IDAndCent,rngvec ,IDandWCSS); + + public long hashvec2_forwcss( float[] xt, float[] x, HashMap MapOfIDAndCent, float[] rngvec, HashMapIDandWCSS_offline) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { + s = s << 1 ; // left shift the bits of s by 1. 
+ if (xt[i] > rngvec[i]) + s= s+1; + + if (MapOfIDAndCent.containsKey(s)) { + + float CurrentCent [] = MapOfIDAndCent.get(s); + float IncomingVector [] = x; + + + float currentWcss= 0; + + if (IDandWCSS_offline.containsKey(s)) { + currentWcss= IDandWCSS_offline.get(s); + } + + float[][] MergedValues = UpdateHashMap_offlineWcss( CurrentCent, currentWcss, IncomingVector ); + + float wcss= MergedValues[2][0]; + + + IDandWCSS_offline.put(s, wcss); + + + } + } + return s; + } + + /* + * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid + * vector map + * + * hash the projected vector x and update the hash to centroid and counts + * maps + */ + void addtocounter(float[] x, Projector p, + HashMap IDAndCent,HashMap IDandID,int ct, float[] rngvec , HashMap IDandWCSS) { + float[] xt = p.project(x); + + hashvec2(xt,x,IDAndCent, IDandID, ct,rngvec , IDandWCSS); + } + + // this method is used to compute the offline WCSS to choose the best of the clusters + //calcWCSSoffline(x, projector, MapOfIDAndCent1, rngvec, MapOfIDAandWCSS1_offline); + + void calcWCSSoffline(float[] x, Projector p, HashMap MapOfIDAndCent, float[] rngvec , HashMap MapOfIDAandWCSS_offline) { + + float[] xt = p.project(x); + + hashvec2_forwcss(xt,x,MapOfIDAndCent,rngvec ,MapOfIDAandWCSS_offline); + + } + + static boolean isPowerOfTwo(long num) { + return (num & -num) == num; + } + + + public void printHashmap(HashMap hashmap) { + + //System.out.println(hashmap.keySet()); + System.out.println(hashmap.values()); + + } +public void printStream(Stream> stream) { + + //System.out.println(hashmap.keySet()); + System.out.println(stream.count()); + +} + + + + /* + * X - data set k - canonical k in k-means l - clustering sub-space Compute + * density mode via iterative deepening hash counting + */ + + public Multimap findDensityModes2() { + //public Map findDensityModes2() { + HashMap MapOfIDAndCent1 = new HashMap<>(); + HashMap MapOfIDAndCount1 = new HashMap<>(); + HashMap MapOfIDAndWCSS1 = new 
HashMap<>(); + + HashMap MapOfIDAndCent2 = new HashMap<>(); + HashMap MapOfIDAndCount2 = new HashMap<>(); + HashMap MapOfIDAndWCSS2 = new HashMap<>(); + + HashMap MapOfIDAndCent3 = new HashMap<>(); + HashMap MapOfIDAndCount3 = new HashMap<>(); + HashMap MapOfIDAndWCSS3 = new HashMap<>(); + + + // #create projector matrixs + Projector projector = so.getProjectionType(); + projector.setOrigDim(so.getdim()); + projector.setProjectedDim(so.getDimparameter()); + projector.setRandomSeed(so.getRandomSeed()); +// projector.setRandomSeed(535247432); + + projector.init(); + int cutoff = so.getCutoff(); + + int ct = 0; + + { + + for (float[] x : so.getRawData()) + { + addtocounter(x, projector, MapOfIDAndCent1, MapOfIDAndCount1,ct++, rngvec, MapOfIDAndWCSS1); + addtocounter(x, projector, MapOfIDAndCent2, MapOfIDAndCount2,ct++, rngvec2,MapOfIDAndWCSS2); + addtocounter(x, projector, MapOfIDAndCent3, MapOfIDAndCount3,ct++, rngvec3,MapOfIDAndWCSS3); + + } + } + + System.out.println("NumberOfMicroClustersBeforePruning = "+ MapOfIDAndCent1.size()); + + // next we want to prune the tree by parent count comparison + // follows breadthfirst search + + HashMap denseSetOfIDandCount2_1 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount1.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount1.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount1.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_1.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + MapOfIDAndCent1.put(parent_id, new float[]{}); + + // MapOfIDAndCount1.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_1.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent1.put(parent_id, new 
float[]{}); + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_2 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount2.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount2.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount2.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_2.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + MapOfIDAndCent2.put(parent_id, new float[]{}); + + // MapOfIDAndCount2.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_2.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent2.put(parent_id, new float[]{}); + // MapOfIDAndCount2.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_3 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount3.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount3.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount3.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_3.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + MapOfIDAndCent3.put(parent_id, new float[]{}); + + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_3.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent3.put(parent_id, new float[]{}); + // 
MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + } + } + } + } + } + + + //remove keys with support less than 1 + Stream> stream2_1 = denseSetOfIDandCount2_1.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2_1= new ArrayList<>(); + // sort and limit the list + stream2_1.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_1.add(x.getKey())); + System.out.println("88888888888888888888888888888888888888888888888888888888888888888888888888888"); + printHashmap(denseSetOfIDandCount2_1); + + + Stream> stream2_2 = denseSetOfIDandCount2_2.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2_2= new ArrayList<>(); + // sort and limit the list + stream2_2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_2.add(x.getKey())); + //printHashmap(denseSetOfIDandCount2_2); + + + Stream> stream2_3 = denseSetOfIDandCount2_3.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2_3= new ArrayList<>(); + // sort and limit the list + stream2_3.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_3.add(x.getKey())); + //printHashmap(denseSetOfIDandCount2_3); + + float WCSS1 = 0; + float WCSS2 = 0; + float WCSS3 = 0; + + float WCSS_off_1 = 0; + float WCSS_off_2 = 0; + float WCSS_off_3 = 0; + + + HashMap denseSetOfIDandCount2 = new HashMap(); + + + HashMap MapOfIDAndCent = new HashMap<>(); + HashMap MapOfIDAndCount = new HashMap<>(); + HashMap MapOfIDAndWCSS = new HashMap<>(); + + HashMap MapOfIDAandWCSS_offline_1 = new HashMap<>(); + HashMap MapOfIDAandWCSS_offline_2 = new HashMap<>(); + HashMap MapOfIDAandWCSS_offline_3 = new HashMap<>(); + + // calculate the real wcss in offline fashion, so for the keys , hash the points into those buckets + // and calculate the wcss as we know their centroids : + + + for (float[] x : so.getRawData()) + { + + calcWCSSoffline(x, projector, MapOfIDAndCent1, rngvec, MapOfIDAandWCSS_offline_1); + calcWCSSoffline(x, projector, MapOfIDAndCent2, rngvec2, MapOfIDAandWCSS_offline_2); + calcWCSSoffline(x, projector, MapOfIDAndCent3, rngvec3, MapOfIDAandWCSS_offline_3); + + } + + + //* THIS IS THE RUNTIME CALCULATION OF WCSS STATISTICS WHICH IS DONE ONLINE: + + for (Long keys: sortedIDList2_1) +// for (Long cur_id : (((HashMap) stream2_1).keySet())) + { // System.out.println("wcss1 = " + MapOfIDAndWCSS1.get(cur_id)); + WCSS1 = WCSS1 + MapOfIDAndWCSS1.get(keys);} + +// for (Long cur_id : (denseSetOfIDandCount2_2.keySet())) + for (Long keys: sortedIDList2_2) + { WCSS2 = WCSS2 + MapOfIDAndWCSS2.get(keys);} + +// for (Long cur_id : (denseSetOfIDandCount2_3.keySet())) + for (Long keys: sortedIDList2_3) + { WCSS3 = WCSS3 + MapOfIDAndWCSS3.get(keys);} + +//* THIS IS THE RUNTIME CALCULATION OF WCSS STATISTICS WHICH REQUIRES ANOTHER PASS OVER THE DATA: + for (Long keys: sortedIDList2_1) +// for (Long cur_id : (((HashMap) stream2_1).keySet())) + { // System.out.println("wcss1 = " + MapOfIDAndWCSS1.get(cur_id)); + WCSS_off_1 = WCSS_off_1 + 
MapOfIDAandWCSS_offline_1.get(keys);} + +// for (Long cur_id : (denseSetOfIDandCount2_2.keySet())) + for (Long keys: sortedIDList2_2) + { WCSS_off_2 = WCSS_off_2 + MapOfIDAandWCSS_offline_2.get(keys);} + +// for (Long cur_id : (denseSetOfIDandCount2_3.keySet())) + for (Long keys: sortedIDList2_3) + { WCSS_off_3 = WCSS_off_3 + MapOfIDAandWCSS_offline_3.get(keys);} + + + + System.out.print("wcss1 = " + WCSS1); + System.out.println(" wcss_ofline_1 = " + WCSS_off_1); + + System.out.print("wcss2 = " + WCSS2); + System.out.println(" wcss_ofline_2 = " + WCSS_off_2); + + System.out.print("wcss3 = " + WCSS3); + System.out.println(" wcss_ofline_3 = " + WCSS_off_3); + + + if ((WCSS_off_1 <= WCSS_off_2) && (WCSS_off_1 <= WCSS_off_3)) + {MapOfIDAndCount = MapOfIDAndCount1; + MapOfIDAndCent = MapOfIDAndCent1; + MapOfIDAndWCSS = MapOfIDAndWCSS1; + denseSetOfIDandCount2 = denseSetOfIDandCount2_1; + System.out.println("winner = tree1"); + } + else if ((WCSS_off_2<= WCSS_off_1) && (WCSS_off_2<=WCSS_off_3)) + {MapOfIDAndCount = MapOfIDAndCount2; + MapOfIDAndCent = MapOfIDAndCent2; + MapOfIDAndWCSS = MapOfIDAndWCSS2; + denseSetOfIDandCount2 = denseSetOfIDandCount2_2; + System.out.println("winner = tree2"); + } + else + {MapOfIDAndCount = MapOfIDAndCount3; + MapOfIDAndCent = MapOfIDAndCent3; + MapOfIDAndWCSS = MapOfIDAndWCSS3; + denseSetOfIDandCount2 = denseSetOfIDandCount2_3; + System.out.println("winner = tree3"); + + } + + System.out.println("NumberOfMicroClustersAfterPruning&beforesortingLimit = "+ denseSetOfIDandCount2.size()); + + //remove keys with support less than 1 + Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2= new ArrayList<>(); + // sort and limit the list + stream2.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2.add(x.getKey())); + + + Multimap multimapWeightAndCent = ArrayListMultimap.create(); + + + for (Long keys: sortedIDList2) + + { + + multimapWeightAndCent.put((Long)(MapOfIDAndCount.get(keys)), (float[]) (MapOfIDAndCent.get(keys))); + + } + + + // this is to be taken out . only done for hypothesis testing. + + boolean raw = Boolean.parseBoolean(("raw")); + List data = null; + try { + data = VectorUtil.readFile("/C:/Users/deysn/Desktop/temp/har/1D.txt", raw); + } catch (FileNotFoundException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + Multimap multimapWeightAndCent1 = ArrayListMultimap.create(); + for (Long keys: sortedIDList2_1) + { + multimapWeightAndCent1.put((Long)(MapOfIDAndCount1.get(keys)), (float[]) (MapOfIDAndCent1.get(keys))); + } + + Multimap multimapWeightAndCent2 = ArrayListMultimap.create(); + for (Long keys: sortedIDList2_2) + { + multimapWeightAndCent2.put((Long)(MapOfIDAndCount2.get(keys)), (float[]) (MapOfIDAndCent2.get(keys))); + } + + Multimap multimapWeightAndCent3 = ArrayListMultimap.create(); + for (Long keys: sortedIDList2_3) + { + multimapWeightAndCent3.put((Long)(MapOfIDAndCount3.get(keys)), (float[]) (MapOfIDAndCent3.get(keys))); + } + + Listcentroids1 = new ArrayList<>(); + List weights1 =new ArrayList<>(); + for (Long weights : multimapWeightAndCent1.keys()) + { + weights1.add((float)weights); + } + + for (Long weight : multimapWeightAndCent1.keySet()) + + { + centroids1.addAll(multimapWeightAndCent1.get(weight)); + } + +// Agglomerative3 aggloOffline = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids1, so.getk()); +// aggloOffline.setWeights(weights1); +// List finalcentroids_1 = aggloOffline.getCentroids(); + + + KMeans2 Offline = new KMeans2(); + Offline.setK(so.getk()); + Offline.setRawData(centroids1); + 
Offline.setWeights(weights1); + List finalcentroids_1 = Offline.getCentroids(); + + + Listcentroids2 = new ArrayList<>(); + List weights2 =new ArrayList<>(); + for (Long weights : multimapWeightAndCent2.keys()) + { + weights2.add((float)weights); + } + + for (Long weight : multimapWeightAndCent2.keySet()) + + { + centroids2.addAll(multimapWeightAndCent1.get(weight)); + } + +// Agglomerative3 aggloOffline2 = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids2, so.getk()); +// aggloOffline2.setWeights(weights2); +// List finalcentroids_2 = aggloOffline2.getCentroids(); + + + KMeans2 Offline2 = new KMeans2(); + Offline2.setK(so.getk()); + Offline2.setRawData(centroids2); + Offline2.setWeights(weights2); + List finalcentroids_2 = Offline2.getCentroids(); + + + Listcentroids3 = new ArrayList<>(); + List weights3 =new ArrayList<>(); + for (Long weights : multimapWeightAndCent3.keys()) + { + weights3.add((float)weights); + } + + for (Long weight : multimapWeightAndCent3.keySet()) + + { + centroids3.addAll(multimapWeightAndCent3.get(weight)); + } + +// Agglomerative3 aggloOffline3 = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids3, so.getk()); +// aggloOffline3.setWeights(weights3); +// List finalcentroids_3 = aggloOffline3.getCentroids(); + + + KMeans2 Offline3 = new KMeans2(); + Offline3.setK(so.getk()); + Offline3.setRawData(centroids3); + Offline3.setWeights(weights3); + List finalcentroids_3 = Offline3.getCentroids(); + + + VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/Desktop/temp/har/run_results/3runs/OutputTwrpCents_tree1"),finalcentroids_1, false); + + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_1, data)); + + VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/Desktop/temp/har/run_results/3runs/OutputTwrpCents_tree2"),finalcentroids_2, false); + + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_2, data)); + + VectorUtil.writeCentroidsToFile(new 
File("/C:/Users/deysn/Desktop/temp/har/run_results/3runs/OutputTwrpCents_tree3"),finalcentroids_3, false); + + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_3, data)); + + + + return multimapWeightAndCent; + +} + + + public void run() { + rngvec = new float[so.getDimparameter()]; + + rngvec2 = new float[so.getDimparameter()]; + + rngvec3 = new float[so.getDimparameter()]; + + counter = 0; + boolean randVect = so.getRandomVector(); + + // Random r = new Random(so.getRandomSeed()); + // Random r = new Random(3800635955020675334L) ; + Random r = new Random(); + Random r2 = new Random(); + Random r3 = new Random(); + + if (randVect==true){ + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) r.nextGaussian(); + + for (int i = 0; i < so.getDimparameter(); i++) + rngvec2[i] = (float) r2.nextGaussian(); + + for (int i = 0; i < so.getDimparameter(); i++) + rngvec3[i] = (float) r3.nextGaussian(); + + + } else { + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) 0; + } + + + Multimap WeightAndClusters = findDensityModes2(); + + + Listcentroids2 = new ArrayList<>(); + List weights2 =new ArrayList<>(); + + + System.out.println("\tNumberOfMicroClusters_AfterPruning = "+ WeightAndClusters.size()); + System.out.println("getRandomVector = "+ randVect); + + + for (Long weights : WeightAndClusters.keys()) + { + + weights2.add((float)weights); + + } + + + for (Long weight : WeightAndClusters.keySet()) + + { + + centroids2.addAll(WeightAndClusters.get(weight)); + + } + + +// Agglomerative3 aggloOffline = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids2, so.getk()); +// aggloOffline.setWeights(weights2); +// this.centroids = aggloOffline.getCentroids(); + + + +// KMeans2 aggloOffline2 = new KMeans2(); +// aggloOffline2.setK(so.getk()); +// aggloOffline2.setRawData(centroids2); +// aggloOffline2.setWeights(weights2); +// this.centroids = aggloOffline2.getCentroids(); + + + DBScan algo = new DBScan(centroids2); + + 
this.centroids = algo.getCentroids(); + + System.out.println("number of centroids = "+ centroids.size()); + + + } + + + public static void main(String[] args) throws FileNotFoundException, + IOException { + + int k = 4;//6; + int d = 16;//16; + int n = 10000; + float var = 1.5f; + int count = 1; + // System.out.printf("ClusterVar\t"); + // for (int i = 0; i < count; i++) + // System.out.printf("Trial%d\t", i); + // System.out.printf("RealWCSS\n"); + + String Output = "/C:/Users/deysn/Desktop/temp/har/run_results/3runs/testing" ; + + float f = var; + float avgrealwcss = 0; + float avgtime = 0; + // System.out.printf("%f\t", f); + // GenerateData gen = new GenerateData(k, n/k, d, f, true, .5f); + + // gen.writeCSVToFile(new File("/home/lee/Desktop/reclsh/in.csv")); + // List data = "/C:/Users/user/Desktop/temp/OutputTwrpCents1" + + // RPHashObject o = new SimpleArrayReader(gen.data, k); + + boolean raw = Boolean.parseBoolean(("raw")); + List data = null; + data = VectorUtil.readFile("/C:/Users/deysn/Desktop/temp/gasdrift/1D.txt", raw); + k = 6; + RPHashObject o = new SimpleArrayReader(data, 6); + + + o.setDimparameter(16); + o.setCutoff(60); + o.setRandomVector(true); + +// System.out.println("cutoff = "+ o.getCutoff()); +// System.out.println("get_random_Vector = "+ o.getRandomVector()); + + TWRPv6_wcss_offline2_TEST rphit = new TWRPv6_wcss_offline2_TEST(o); + long startTime = System.nanoTime(); + List centsr = rphit.getCentroids(); + + avgtime += (System.nanoTime() - startTime) / 100000000; + +// avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(),gen.getData()); + + + VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); + +// System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, data)); +// + System.gc(); + +// System.out.printf("%.0f\n", avgrealwcss / count); + + + } + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + 
public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + this.so.setK(getk); + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + return false; + } + + //@Override + public void setCutoff(int getCutoff) { + this.so.setCutoff(getCutoff); + } + + //@Override + public void setRandomVector(boolean getRandomVector) { + this.so.setRandomVector(getRandomVector); + } + + +} diff --git a/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_10runs.java b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_10runs.java new file mode 100644 index 0000000..75eed62 --- /dev/null +++ b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_10runs.java @@ -0,0 +1,1596 @@ +package edu.uc.rphash; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +//import java.util.Arrays; +import java.util.HashMap; +//import java.util.Iterator; +//import java.util.LinkedHashMap; +import java.util.List; +//import java.util.Map; +import java.util.Map.Entry; +import java.util.Random; +import java.util.TreeSet; +import java.util.stream.Stream; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.clusterers.Agglomerative3; +import edu.uc.rphash.tests.clusterers.KMeans2; +import edu.uc.rphash.tests.clusterers.Agglomerative3.ClusteringType; +import 
edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.VectorUtil; + +//import org.apache.commons.collections.map.MultiValueMap; +//import org.apache.commons.collections.map.*; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Multimap; + + + +// this algorithm runs twrp 5 times : (only the random bisection vector varies, the Projection matrix remains same) +// and selects the one which has the best wcss offline for the 10X candidate centroids. +public class TWRPv6_wcss_offline2_TEST2_10runs implements Clusterer, Runnable { + + boolean znorm = false; + + // convert this to an array of arrays + private int counter; + private float[] rngvec; + private float[] rngvec2; + private float[] rngvec3; + private float[] rngvec4; + private float[] rngvec5; + private float[] rngvec6; + private float[] rngvec7; + private float[] rngvec8; + private float[] rngvec9; + private float[] rngvec10; + + private List centroids = null; + + private RPHashObject so; + + public TWRPv6_wcss_offline2_TEST2_10runs(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + return getCentroids(); + } + + @Override + public List getCentroids() { + if (centroids == null) + run(); + return centroids; + } + + +// This function returns the square of the euclidean distance. + public static float distancesq(float[] x, float[] y) { + if (x.length < 1) + return Float.MAX_VALUE; + if (y.length < 1) + return Float.MAX_VALUE; + float dist = (x[0] - y[0]) * (x[0] - y[0]); + for (int i = 1; i < x.length; i++) + dist += ((x[i] - y[i]) * (x[i] - y[i])); +// return (float) Math.sqrt(dist); + return dist; + } + + +// This method finds the largest of the numbers and returns that index. 
+ + public static int smallest(float[] arr) + { + // Initialize minimum element + float min = arr[0]; + int minindex = 0; + // System.out.println(" LENGHT : " + arr.length); + // Traverse array elements from second and + // compare every element with current max + for (int i = 1; i < (arr.length); i++) { + // System.out.println("the min value of i : " + i); + if (arr[i]< min) { + min = arr[i]; + minindex = i; + } + } + // System.out.println("the min value is : " + min); + // System.out.println("the index for min val is : " + minindex); + return minindex; + } + + + /* + * X - set of vectors compute the medoid of a vector set + */ + float[] medoid(List X) { + float[] ret = X.get(0); + for (int i = 1; i < X.size(); i++) { + for (int j = 0; j < ret.length; j++) { + ret[j] += X.get(i)[j]; + } + } + for (int j = 0; j < ret.length; j++) { + ret[j] = ret[j] / ((float) X.size()); + } + return ret; + } + +// this updates the map two cents with different weights are merged into one. + public static float[][] UpdateHashMap(float cnt_1, float[] x_1, float wcss_1, + float cnt_2, float[] x_2 , float wcss_2) { + + float cnt_r = cnt_1 + cnt_2; + + float[] x_r = new float[x_1.length]; + + for (int i = 0; i < x_1.length; i++) { + x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; + + } + + float wcss = ( ((wcss_1 + distancesq(x_r,x_1)) ) + distancesq(x_r,x_2) ); + + float[][] ret = new float[3][]; + ret[0] = new float[1]; + ret[0][0] = cnt_r; + ret[1] = x_r; + ret[2]= new float [1]; + ret[2][0]= wcss; + return ret; + + } + +// this method is used to calculate the offline wcss +// UpdateHashMap_offlineWcss( CurrentCent, currentWcss, IncomingVector, incomingWcss ); + + public static float[][] UpdateHashMap_offlineWcss(float[] x_1, float wcss_1,float[] x_2 ) { + + float wcss = wcss_1 + distancesq(x_1,x_2); + + + float[][] ret = new float[3][]; + ret[0] = new float[1]; + ret[2]= new float [1]; + ret[2][0]= wcss; + return ret; + + } + + public long hashvec2( float[] xt, float[] x, + 
HashMap MapOfIDAndCent, HashMap MapOfIDAndCount, int ct, float[] rngvec, HashMap MapOfIDAndWCSS) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { +// s <<= 1; + s = s << 1 ; // left shift the bits of s by 1. + if (xt[i] > rngvec[i]) + s= s+1; + + if (MapOfIDAndCent.containsKey(s)) { + + float CurrentCount = MapOfIDAndCount.get(s); + float CurrentCent [] = MapOfIDAndCent.get(s); + float CountForIncomingVector = 1; + float IncomingVector [] = x; + float currentWcss= MapOfIDAndWCSS.get(s); + float incomingWcss= 0; + + float[][] MergedValues = UpdateHashMap(CurrentCount , CurrentCent, currentWcss, CountForIncomingVector, IncomingVector, incomingWcss ); + + Long UpdatedCount = (long) MergedValues[0][0] ; + + float[] MergedVector = MergedValues[1] ; + + float wcss= MergedValues[2][0]; + + MapOfIDAndCount.put(s , UpdatedCount); + + MapOfIDAndCent.put(s, MergedVector); + + MapOfIDAndWCSS.put(s, wcss); + + } + + else { + + float[] xlist = x; + MapOfIDAndCent.put(s, xlist); + MapOfIDAndCount.put(s, (long)1); + MapOfIDAndWCSS.put(s, (float)0); + } + } + return s; + } + +// this hash is to calculate the wcss +// hashvec2_forwcss(xt,x,IDAndCent,rngvec ,IDandWCSS); + + public long hashvec2_forwcss( float[] xt, float[] x, HashMap MapOfIDAndCent, float[] rngvec, HashMapIDandWCSS_offline) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { + s = s << 1 ; // left shift the bits of s by 1. 
+ if (xt[i] > rngvec[i]) + s= s+1; + + if (MapOfIDAndCent.containsKey(s)) { + + float CurrentCent [] = MapOfIDAndCent.get(s); + float IncomingVector [] = x; + + + float currentWcss= 0; + + if (IDandWCSS_offline.containsKey(s)) { + currentWcss= IDandWCSS_offline.get(s); + } + + float[][] MergedValues = UpdateHashMap_offlineWcss( CurrentCent, currentWcss, IncomingVector ); + + float wcss= MergedValues[2][0]; + + IDandWCSS_offline.put(s, wcss); + + } + } + return s; + } + + /* + * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid + * vector map + * + * hash the projected vector x and update the hash to centroid and counts + * maps + */ + void addtocounter(float[] x, Projector p, + HashMap IDAndCent,HashMap IDandID,int ct, float[] rngvec , HashMap IDandWCSS) { + float[] xt = p.project(x); + + hashvec2(xt,x,IDAndCent, IDandID, ct,rngvec , IDandWCSS); + } + + // this method is used to compute the offline WCSS to choose the best of the clusters + //calcWCSSoffline(x, projector, MapOfIDAndCent1, rngvec, MapOfIDAandWCSS1_offline); + + void calcWCSSoffline(float[] x, Projector p, HashMap MapOfIDAndCent, float[] rngvec , HashMap MapOfIDAandWCSS_offline) { + + float[] xt = p.project(x); + + hashvec2_forwcss(xt,x,MapOfIDAndCent,rngvec ,MapOfIDAandWCSS_offline); + + } + + static boolean isPowerOfTwo(long num) { + return (num & -num) == num; + } + + /* + * X - data set k - canonical k in k-means l - clustering sub-space Compute + * density mode via iterative deepening hash counting + */ + + public Multimap findDensityModes2() { + + HashMap MapOfIDAndCent1 = new HashMap<>(); + HashMap MapOfIDAndCount1 = new HashMap<>(); + HashMap MapOfIDAndWCSS1 = new HashMap<>(); + + HashMap MapOfIDAndCent2 = new HashMap<>(); + HashMap MapOfIDAndCount2 = new HashMap<>(); + HashMap MapOfIDAndWCSS2 = new HashMap<>(); + + HashMap MapOfIDAndCent3 = new HashMap<>(); + HashMap MapOfIDAndCount3 = new HashMap<>(); + HashMap MapOfIDAndWCSS3 = new HashMap<>(); + + HashMap MapOfIDAndCent4 
= new HashMap<>(); + HashMap MapOfIDAndCount4 = new HashMap<>(); + HashMap MapOfIDAndWCSS4 = new HashMap<>(); + + HashMap MapOfIDAndCent5 = new HashMap<>(); + HashMap MapOfIDAndCount5 = new HashMap<>(); + HashMap MapOfIDAndWCSS5 = new HashMap<>(); + + HashMap MapOfIDAndCent6 = new HashMap<>(); + HashMap MapOfIDAndCount6 = new HashMap<>(); + HashMap MapOfIDAndWCSS6 = new HashMap<>(); + + HashMap MapOfIDAndCent7 = new HashMap<>(); + HashMap MapOfIDAndCount7 = new HashMap<>(); + HashMap MapOfIDAndWCSS7 = new HashMap<>(); + + HashMap MapOfIDAndCent8 = new HashMap<>(); + HashMap MapOfIDAndCount8 = new HashMap<>(); + HashMap MapOfIDAndWCSS8 = new HashMap<>(); + + HashMap MapOfIDAndCent9 = new HashMap<>(); + HashMap MapOfIDAndCount9 = new HashMap<>(); + HashMap MapOfIDAndWCSS9 = new HashMap<>(); + + HashMap MapOfIDAndCent10 = new HashMap<>(); + HashMap MapOfIDAndCount10 = new HashMap<>(); + HashMap MapOfIDAndWCSS10 = new HashMap<>(); + + + + // #create projector matrixs + Projector projector = so.getProjectionType(); + projector.setOrigDim(so.getdim()); + projector.setProjectedDim(so.getDimparameter()); + projector.setRandomSeed(so.getRandomSeed()); +// projector.setRandomSeed(535247432); + + projector.init(); + int cutoff = so.getCutoff(); + + int ct = 0; + + { + + for (float[] x : so.getRawData()) + { + addtocounter(x, projector, MapOfIDAndCent1, MapOfIDAndCount1,ct++, rngvec, MapOfIDAndWCSS1); + addtocounter(x, projector, MapOfIDAndCent2, MapOfIDAndCount2,ct++, rngvec2,MapOfIDAndWCSS2); + addtocounter(x, projector, MapOfIDAndCent3, MapOfIDAndCount3,ct++, rngvec3,MapOfIDAndWCSS3); + addtocounter(x, projector, MapOfIDAndCent4, MapOfIDAndCount4,ct++, rngvec4,MapOfIDAndWCSS4); + addtocounter(x, projector, MapOfIDAndCent5, MapOfIDAndCount5,ct++, rngvec5,MapOfIDAndWCSS5); + + addtocounter(x, projector, MapOfIDAndCent6, MapOfIDAndCount6,ct++, rngvec6, MapOfIDAndWCSS6); + addtocounter(x, projector, MapOfIDAndCent7, MapOfIDAndCount7,ct++, rngvec7,MapOfIDAndWCSS7); + 
addtocounter(x, projector, MapOfIDAndCent8, MapOfIDAndCount8,ct++, rngvec8,MapOfIDAndWCSS8); + addtocounter(x, projector, MapOfIDAndCent9, MapOfIDAndCount9,ct++, rngvec9,MapOfIDAndWCSS9); + addtocounter(x, projector, MapOfIDAndCent10, MapOfIDAndCount10,ct++, rngvec10,MapOfIDAndWCSS10); + + } + } + + System.out.println("NumberOfMicroClustersBeforePruning = "+ MapOfIDAndCent3.size()); + + // next we want to prune the tree by parent count comparison + // follows breadthfirst search + + HashMap denseSetOfIDandCount2_1 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount1.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount1.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount1.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_1.put(parent_id, 0L); + + MapOfIDAndCent1.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_1.remove(parent_id); + + MapOfIDAndCent1.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_2 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount2.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount2.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount2.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_2.put(parent_id, 0L); + + MapOfIDAndCent2.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_2.remove(parent_id); + + MapOfIDAndCent2.put(parent_id, new float[]{}); + + 
denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_3 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount3.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount3.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount3.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_3.put(parent_id, 0L); + + MapOfIDAndCent3.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_3.remove(parent_id); + + MapOfIDAndCent3.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_4 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount4.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount4.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount4.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_4.put(parent_id, 0L); + + MapOfIDAndCent4.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_4.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_4.remove(parent_id); + + MapOfIDAndCent4.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_4.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_5 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount5.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount5.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount5.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + 
{ + if(cur_count == parent_count) { + denseSetOfIDandCount2_5.put(parent_id, 0L); + + MapOfIDAndCent5.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_5.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_5.remove(parent_id); + + MapOfIDAndCent5.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_5.put(cur_id, (long) cur_count); + } + } + } + } + } + + + + HashMap denseSetOfIDandCount2_6 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount6.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount6.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount6.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_6.put(parent_id, 0L); + + MapOfIDAndCent6.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_6.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_6.remove(parent_id); + + MapOfIDAndCent6.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_6.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_7 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount7.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount7.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount7.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_7.put(parent_id, 0L); + + MapOfIDAndCent7.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_7.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_7.remove(parent_id); + + MapOfIDAndCent7.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_7.put(cur_id, (long) cur_count); + } + } + } + } + } + + HashMap 
denseSetOfIDandCount2_8 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount8.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount8.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount8.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_8.put(parent_id, 0L); + + MapOfIDAndCent8.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_8.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_8.remove(parent_id); + + MapOfIDAndCent8.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_8.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_9 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount9.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount9.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount9.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_9.put(parent_id, 0L); + + MapOfIDAndCent9.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_9.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_9.remove(parent_id); + + MapOfIDAndCent9.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_9.put(cur_id, (long) cur_count); + } + } + } + } + } + + HashMap denseSetOfIDandCount2_10 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount10.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount10.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount10.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_10.put(parent_id, 0L); + + 
MapOfIDAndCent10.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_10.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_10.remove(parent_id); + + MapOfIDAndCent10.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_10.put(cur_id, (long) cur_count); + } + } + } + } + } + + + + + //remove keys with support less than 1 + Stream> stream2_1 = denseSetOfIDandCount2_1.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_1= new ArrayList<>(); + // sort and limit the list + stream2_1.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_1.add(x.getKey())); + + + + Stream> stream2_2 = denseSetOfIDandCount2_2.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_2= new ArrayList<>(); + // sort and limit the list + stream2_2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_2.add(x.getKey())); + + + Stream> stream2_3 = denseSetOfIDandCount2_3.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_3= new ArrayList<>(); + // sort and limit the list + stream2_3.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_3.add(x.getKey())); + + + Stream> stream2_4 = denseSetOfIDandCount2_4.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_4= new ArrayList<>(); + // sort and limit the list + stream2_4.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_4.add(x.getKey())); + + Stream> stream2_5 = denseSetOfIDandCount2_5.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_5= new ArrayList<>(); + // sort and limit the list + stream2_5.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_5.add(x.getKey())); + + + + Stream> stream2_6 = denseSetOfIDandCount2_6.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_6= new ArrayList<>(); + // sort and limit the list + stream2_6.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_6.add(x.getKey())); + + + + Stream> stream2_7 = denseSetOfIDandCount2_7.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_7= new ArrayList<>(); + // sort and limit the list + stream2_7.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_7.add(x.getKey())); + + + + Stream> stream2_8 = denseSetOfIDandCount2_8.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_8= new ArrayList<>(); + // sort and limit the list + stream2_8.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_8.add(x.getKey())); + + + + Stream> stream2_9 = denseSetOfIDandCount2_9.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_9= new ArrayList<>(); + // sort and limit the list + stream2_9.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_9.add(x.getKey())); + + + + Stream> stream2_10 = denseSetOfIDandCount2_10.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_10= new ArrayList<>(); + // sort and limit the list + stream2_10.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_10.add(x.getKey())); + + + + + float WCSS1 = 0; + float WCSS2 = 0; + float WCSS3 = 0; + float WCSS4 = 0; + float WCSS5 = 0; + float WCSS6 = 0; + float WCSS7 = 0; + float WCSS8 = 0; + float WCSS9 = 0; + float WCSS10 = 0; + + float WCSS_off_1 = 0; + float WCSS_off_2 = 0; + float WCSS_off_3 = 0; + float WCSS_off_4 = 0; + float WCSS_off_5 = 0; + float WCSS_off_6 = 0; + float WCSS_off_7 = 0; + float WCSS_off_8 = 0; + float WCSS_off_9 = 0; + float WCSS_off_10 = 0; + + HashMap denseSetOfIDandCount2 = new HashMap(); + HashMap MapOfIDAndCent = new HashMap<>(); + HashMap MapOfIDAndCount = new HashMap<>(); + HashMap MapOfIDAndWCSS = new HashMap<>(); + + HashMap MapOfIDAandWCSS_offline_1 = new HashMap<>(); + HashMap MapOfIDAandWCSS_offline_2 = new HashMap<>(); + HashMap MapOfIDAandWCSS_offline_3 = new HashMap<>(); + HashMap MapOfIDAandWCSS_offline_4 = new HashMap<>(); + HashMap MapOfIDAandWCSS_offline_5 = new HashMap<>(); + HashMap MapOfIDAandWCSS_offline_6 = new HashMap<>(); + HashMap MapOfIDAandWCSS_offline_7 = new HashMap<>(); + HashMap MapOfIDAandWCSS_offline_8 = new HashMap<>(); + HashMap MapOfIDAandWCSS_offline_9 = new HashMap<>(); + HashMap MapOfIDAandWCSS_offline_10 = new HashMap<>(); + + + // calculate the real wcss in offline fashion, so for the keys , hash the points into those buckets + // and calculate the wcss as we know their centroids : + + + for (float[] x : so.getRawData()) + { + + calcWCSSoffline(x, projector, MapOfIDAndCent1, rngvec , MapOfIDAandWCSS_offline_1); + calcWCSSoffline(x, projector, MapOfIDAndCent2, rngvec2, MapOfIDAandWCSS_offline_2); + calcWCSSoffline(x, projector, MapOfIDAndCent3, rngvec3, MapOfIDAandWCSS_offline_3); + calcWCSSoffline(x, projector, MapOfIDAndCent4, rngvec4, MapOfIDAandWCSS_offline_4); + calcWCSSoffline(x, projector, MapOfIDAndCent5, rngvec5, MapOfIDAandWCSS_offline_5); + + calcWCSSoffline(x, projector, MapOfIDAndCent6, rngvec6 , 
MapOfIDAandWCSS_offline_6); + calcWCSSoffline(x, projector, MapOfIDAndCent7, rngvec7, MapOfIDAandWCSS_offline_7); + calcWCSSoffline(x, projector, MapOfIDAndCent8, rngvec8, MapOfIDAandWCSS_offline_8); + calcWCSSoffline(x, projector, MapOfIDAndCent9, rngvec9, MapOfIDAandWCSS_offline_9); + calcWCSSoffline(x, projector, MapOfIDAndCent10, rngvec10, MapOfIDAandWCSS_offline_10); + } + + + //* THIS IS THE RUNTIME CALCULATION OF WCSS STATISTICS WHICH IS DONE ONLINE: + + for (Long keys: sortedIDList2_1){ + WCSS1 = WCSS1 + MapOfIDAndWCSS1.get(keys);} + + for (Long keys: sortedIDList2_2) + { WCSS2 = WCSS2 + MapOfIDAndWCSS2.get(keys);} + + for (Long keys: sortedIDList2_3) + { WCSS3 = WCSS3 + MapOfIDAndWCSS3.get(keys);} + + for (Long keys: sortedIDList2_4) + { WCSS4 = WCSS4 + MapOfIDAndWCSS4.get(keys);} + + for (Long keys: sortedIDList2_5) + { WCSS5 = WCSS5 + MapOfIDAndWCSS5.get(keys);} + + for (Long keys: sortedIDList2_6){ + WCSS6 = WCSS6 + MapOfIDAndWCSS6.get(keys);} + + for (Long keys: sortedIDList2_7) + { WCSS7 = WCSS7 + MapOfIDAndWCSS7.get(keys);} + + for (Long keys: sortedIDList2_8) + { WCSS8 = WCSS8 + MapOfIDAndWCSS8.get(keys);} + + for (Long keys: sortedIDList2_9) + { WCSS9 = WCSS9 + MapOfIDAndWCSS9.get(keys);} + + for (Long keys: sortedIDList2_10) + { WCSS10 = WCSS10 + MapOfIDAndWCSS10.get(keys);} + +//* THIS IS THE RUNTIME CALCULATION OF WCSS STATISTICS WHICH REQUIRES ANOTHER PASS OVER THE DATA: + + for (Long keys: sortedIDList2_1) + { WCSS_off_1 = WCSS_off_1 + MapOfIDAandWCSS_offline_1.get(keys);} + + for (Long keys: sortedIDList2_2) + { WCSS_off_2 = WCSS_off_2 + MapOfIDAandWCSS_offline_2.get(keys);} + + for (Long keys: sortedIDList2_3) + { WCSS_off_3 = WCSS_off_3 + MapOfIDAandWCSS_offline_3.get(keys);} + + for (Long keys: sortedIDList2_4) + { WCSS_off_4 = WCSS_off_4 + MapOfIDAandWCSS_offline_4.get(keys);} + + for (Long keys: sortedIDList2_5) + { WCSS_off_5 = WCSS_off_5 + MapOfIDAandWCSS_offline_5.get(keys);} + + for (Long keys: sortedIDList2_6) + { WCSS_off_6 = 
WCSS_off_6 + MapOfIDAandWCSS_offline_6.get(keys);} + + for (Long keys: sortedIDList2_7) + { WCSS_off_7 = WCSS_off_7 + MapOfIDAandWCSS_offline_7.get(keys);} + + for (Long keys: sortedIDList2_8) + { WCSS_off_8 = WCSS_off_8 + MapOfIDAandWCSS_offline_8.get(keys);} + + for (Long keys: sortedIDList2_9) + { WCSS_off_9 = WCSS_off_9 + MapOfIDAandWCSS_offline_9.get(keys);} + + for (Long keys: sortedIDList2_10) + { WCSS_off_10 = WCSS_off_10 + MapOfIDAandWCSS_offline_10.get(keys);} + + + + + System.out.print("wcss1 = " + WCSS1); + System.out.println(" wcss_ofline_1 = " + WCSS_off_1); + + System.out.print("wcss2 = " + WCSS2); + System.out.println(" wcss_ofline_2 = " + WCSS_off_2); + + System.out.print("wcss3 = " + WCSS3); + System.out.println(" wcss_ofline_3 = " + WCSS_off_3); + + System.out.print("wcss4 = " + WCSS4); + System.out.println(" wcss_ofline_4 = " + WCSS_off_4); + + System.out.print("wcss5 = " + WCSS5); + System.out.println(" wcss_ofline_5 = " + WCSS_off_5); + + System.out.print("wcss6 = " + WCSS6); + System.out.println(" wcss_ofline_6 = " + WCSS_off_6); + + System.out.print("wcss7 = " + WCSS7); + System.out.println(" wcss_ofline_7 = " + WCSS_off_7); + + System.out.print("wcss8 = " + WCSS8); + System.out.println(" wcss_ofline_8 = " + WCSS_off_8); + + System.out.print("wcss9 = " + WCSS9); + System.out.println(" wcss_ofline_9 = " + WCSS_off_9); + + System.out.print("wcss10 = " + WCSS10); + System.out.println(" wcss_ofline_10 = " + WCSS_off_10); + + + + float arr[] = {WCSS_off_1,WCSS_off_2,WCSS_off_3,WCSS_off_4,WCSS_off_5,WCSS_off_6,WCSS_off_7,WCSS_off_8,WCSS_off_9,WCSS_off_10}; + int index_of_max = smallest(arr); + + if (index_of_max == 0){ + MapOfIDAndCount = MapOfIDAndCount1; + MapOfIDAndCent = MapOfIDAndCent1; + MapOfIDAndWCSS = MapOfIDAndWCSS1; + denseSetOfIDandCount2 = denseSetOfIDandCount2_1; + System.out.println("winner = tree1"); + } + if (index_of_max == 1){ + MapOfIDAndCount = MapOfIDAndCount2; + MapOfIDAndCent = MapOfIDAndCent2; + MapOfIDAndWCSS = 
MapOfIDAndWCSS2; + denseSetOfIDandCount2 = denseSetOfIDandCount2_2; + System.out.println("winner = tree2"); + } + if (index_of_max == 2){ + MapOfIDAndCount = MapOfIDAndCount3; + MapOfIDAndCent = MapOfIDAndCent3; + MapOfIDAndWCSS = MapOfIDAndWCSS3; + denseSetOfIDandCount2 = denseSetOfIDandCount2_3; + System.out.println("winner = tree3"); + } + if (index_of_max == 3) { + MapOfIDAndCount = MapOfIDAndCount4; + MapOfIDAndCent = MapOfIDAndCent4; + MapOfIDAndWCSS = MapOfIDAndWCSS4; + denseSetOfIDandCount2 = denseSetOfIDandCount2_4; + System.out.println("winner = tree4"); + } + + if (index_of_max == 4) { + MapOfIDAndCount = MapOfIDAndCount5; + MapOfIDAndCent = MapOfIDAndCent5; + MapOfIDAndWCSS = MapOfIDAndWCSS5; + denseSetOfIDandCount2 = denseSetOfIDandCount2_5; + System.out.println("winner = tree5"); + } + if (index_of_max == 5) { + MapOfIDAndCount = MapOfIDAndCount6; + MapOfIDAndCent = MapOfIDAndCent6; + MapOfIDAndWCSS = MapOfIDAndWCSS6; + denseSetOfIDandCount2 = denseSetOfIDandCount2_6; + System.out.println("winner = tree6"); + } + if (index_of_max == 6) { + MapOfIDAndCount = MapOfIDAndCount7; + MapOfIDAndCent = MapOfIDAndCent7; + MapOfIDAndWCSS = MapOfIDAndWCSS7; + denseSetOfIDandCount2 = denseSetOfIDandCount2_7; + System.out.println("winner = tree7"); + } + if (index_of_max == 7) { + MapOfIDAndCount = MapOfIDAndCount8; + MapOfIDAndCent = MapOfIDAndCent8; + MapOfIDAndWCSS = MapOfIDAndWCSS8; + denseSetOfIDandCount2 = denseSetOfIDandCount2_8; + System.out.println("winner = tree8"); + } + if (index_of_max == 8) { + MapOfIDAndCount = MapOfIDAndCount9; + MapOfIDAndCent = MapOfIDAndCent9; + MapOfIDAndWCSS = MapOfIDAndWCSS9; + denseSetOfIDandCount2 = denseSetOfIDandCount2_9; + System.out.println("winner = tree9"); + } + if (index_of_max == 9) { + MapOfIDAndCount = MapOfIDAndCount10; + MapOfIDAndCent = MapOfIDAndCent10; + MapOfIDAndWCSS = MapOfIDAndWCSS10; + denseSetOfIDandCount2 = denseSetOfIDandCount2_10; + System.out.println("winner = tree10"); + } + + 
System.out.println("NumberOfMicroClustersAfterPruning&beforesortingLimit = "+ denseSetOfIDandCount2.size()); + + //remove keys with support less than 1 + Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2= new ArrayList<>(); + // sort and limit the list + stream2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2.add(x.getKey())); + + Multimap multimapWeightAndCent = ArrayListMultimap.create(); + + for (Long keys: sortedIDList2) + + { + + multimapWeightAndCent.put((Long)(MapOfIDAndCount.get(keys)), (float[]) (MapOfIDAndCent.get(keys))); + + } + + + // this is to be taken out . only done for hypothesis testing. + + boolean raw = Boolean.parseBoolean(("raw")); + List data = null; + try { + data = VectorUtil.readFile("/C:/Users/deysn/Desktop/temp/har/1D.txt", raw); + } catch (FileNotFoundException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + Multimap multimapWeightAndCent1 = ArrayListMultimap.create(); + for (Long keys: sortedIDList2_1) + { + multimapWeightAndCent1.put((Long)(MapOfIDAndCount1.get(keys)), (float[]) (MapOfIDAndCent1.get(keys))); + } + + Multimap multimapWeightAndCent2 = ArrayListMultimap.create(); + for (Long keys: sortedIDList2_2) + { + multimapWeightAndCent2.put((Long)(MapOfIDAndCount2.get(keys)), (float[]) (MapOfIDAndCent2.get(keys))); + } + + Multimap multimapWeightAndCent3 = ArrayListMultimap.create(); + for (Long keys: sortedIDList2_3) + { + multimapWeightAndCent3.put((Long)(MapOfIDAndCount3.get(keys)), (float[]) (MapOfIDAndCent3.get(keys))); + } + + Multimap multimapWeightAndCent4 = ArrayListMultimap.create(); + for (Long keys: sortedIDList2_4) + { + multimapWeightAndCent4.put((Long)(MapOfIDAndCount4.get(keys)), (float[]) (MapOfIDAndCent4.get(keys))); + } + + Multimap multimapWeightAndCent5 = ArrayListMultimap.create(); + for 
(Long keys: sortedIDList2_5) + { + multimapWeightAndCent5.put((Long)(MapOfIDAndCount5.get(keys)), (float[]) (MapOfIDAndCent5.get(keys))); + } + + Multimap multimapWeightAndCent6 = ArrayListMultimap.create(); + for (Long keys: sortedIDList2_6) + { + multimapWeightAndCent6.put((Long)(MapOfIDAndCount6.get(keys)), (float[]) (MapOfIDAndCent6.get(keys))); + } + Multimap multimapWeightAndCent7 = ArrayListMultimap.create(); + for (Long keys: sortedIDList2_7) + { + multimapWeightAndCent7.put((Long)(MapOfIDAndCount7.get(keys)), (float[]) (MapOfIDAndCent7.get(keys))); + } + + Multimap multimapWeightAndCent8 = ArrayListMultimap.create(); + for (Long keys: sortedIDList2_8) + { + multimapWeightAndCent8.put((Long)(MapOfIDAndCount8.get(keys)), (float[]) (MapOfIDAndCent8.get(keys))); + } + + Multimap multimapWeightAndCent9 = ArrayListMultimap.create(); + for (Long keys: sortedIDList2_9) + { + multimapWeightAndCent9.put((Long)(MapOfIDAndCount9.get(keys)), (float[]) (MapOfIDAndCent9.get(keys))); + } + + Multimap multimapWeightAndCent10 = ArrayListMultimap.create(); + for (Long keys: sortedIDList2_10) + { + multimapWeightAndCent10.put((Long)(MapOfIDAndCount10.get(keys)), (float[]) (MapOfIDAndCent10.get(keys))); + } + + + + Listcentroids1 = new ArrayList<>(); + List weights1 =new ArrayList<>(); + for (Long weights : multimapWeightAndCent1.keys()) + { + weights1.add((float)weights); + } + + for (Long weight : multimapWeightAndCent1.keySet()) + + { + centroids1.addAll(multimapWeightAndCent1.get(weight)); + } + +// Agglomerative3 aggloOffline = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids1, so.getk()); +// aggloOffline.setWeights(weights1); +// List finalcentroids_1 = aggloOffline.getCentroids(); + + KMeans2 Offline = new KMeans2(); + Offline.setK(so.getk()); + Offline.setRawData(centroids1); + Offline.setWeights(weights1); + List finalcentroids_1 = Offline.getCentroids(); + + + + Listcentroids2 = new ArrayList<>(); + List weights2 =new ArrayList<>(); + for (Long weights : 
multimapWeightAndCent2.keys()) + { + weights2.add((float)weights); + } + + for (Long weight : multimapWeightAndCent2.keySet()) + + { + centroids2.addAll(multimapWeightAndCent1.get(weight)); + } + +// Agglomerative3 aggloOffline2 = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids2, so.getk()); +// aggloOffline2.setWeights(weights2); +// List finalcentroids_2 = aggloOffline2.getCentroids(); + + KMeans2 Offline2 = new KMeans2(); + Offline2.setK(so.getk()); + Offline2.setRawData(centroids2); + Offline2.setWeights(weights2); + List finalcentroids_2 = Offline2.getCentroids(); + + + Listcentroids3 = new ArrayList<>(); + List weights3 =new ArrayList<>(); + for (Long weights : multimapWeightAndCent3.keys()) + { + weights3.add((float)weights); + } + + for (Long weight : multimapWeightAndCent3.keySet()) + + { + centroids3.addAll(multimapWeightAndCent3.get(weight)); + } + +// Agglomerative3 aggloOffline3 = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids3, so.getk()); +// aggloOffline3.setWeights(weights3); +// List finalcentroids_3 = aggloOffline3.getCentroids(); + + KMeans2 Offline3 = new KMeans2(); + Offline3.setK(so.getk()); + Offline3.setRawData(centroids3); + Offline3.setWeights(weights3); + List finalcentroids_3 = Offline3.getCentroids(); + + Listcentroids4 = new ArrayList<>(); + List weights4 =new ArrayList<>(); + for (Long weights : multimapWeightAndCent4.keys()) + { + weights4.add((float)weights); + } + + for (Long weight : multimapWeightAndCent4.keySet()) + + { + centroids4.addAll(multimapWeightAndCent4.get(weight)); + } + +// Agglomerative3 aggloOffline4 = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids4, so.getk()); +// aggloOffline4.setWeights(weights4); +// List finalcentroids_4 = aggloOffline4.getCentroids(); + + KMeans2 Offline4 = new KMeans2(); + Offline4.setK(so.getk()); + Offline4.setRawData(centroids4); + Offline4.setWeights(weights4); + List finalcentroids_4 = Offline4.getCentroids(); + + + Listcentroids5 = new ArrayList<>(); + List 
weights5 =new ArrayList<>(); + for (Long weights : multimapWeightAndCent5.keys()) + { + weights5.add((float)weights); + } + + for (Long weight : multimapWeightAndCent5.keySet()) + + { + centroids5.addAll(multimapWeightAndCent5.get(weight)); + } + +// Agglomerative3 aggloOffline5 = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids5, so.getk()); +// aggloOffline5.setWeights(weights5); +// List finalcentroids_5 = aggloOffline5.getCentroids(); + + KMeans2 Offline5 = new KMeans2(); + Offline5.setK(so.getk()); + Offline5.setRawData(centroids5); + Offline5.setWeights(weights5); + List finalcentroids_5 = Offline5.getCentroids(); + + + Listcentroids6 = new ArrayList<>(); + List weights6 =new ArrayList<>(); + for (Long weights : multimapWeightAndCent6.keys()) + { + weights6.add((float)weights); + } + + for (Long weight : multimapWeightAndCent6.keySet()) + + { + centroids6.addAll(multimapWeightAndCent6.get(weight)); + } + +// Agglomerative3 aggloOffline6 = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids6, so.getk()); +// aggloOffline6.setWeights(weights6); +// List finalcentroids_6 = aggloOffline6.getCentroids(); + + KMeans2 Offline6 = new KMeans2(); + Offline6.setK(so.getk()); + Offline6.setRawData(centroids6); + Offline6.setWeights(weights6); + List finalcentroids_6 = Offline6.getCentroids(); + + Listcentroids7 = new ArrayList<>(); + List weights7 =new ArrayList<>(); + for (Long weights : multimapWeightAndCent7.keys()) + { + weights7.add((float)weights); + } + + for (Long weight : multimapWeightAndCent7.keySet()) + + { + centroids7.addAll(multimapWeightAndCent7.get(weight)); + } + +// Agglomerative3 aggloOffline7 = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids7, so.getk()); +// aggloOffline7.setWeights(weights7); +// List finalcentroids_7 = aggloOffline7.getCentroids(); + + KMeans2 Offline7 = new KMeans2(); + Offline7.setK(so.getk()); + Offline7.setRawData(centroids7); + Offline7.setWeights(weights7); + List finalcentroids_7 = 
Offline7.getCentroids(); + + + Listcentroids8 = new ArrayList<>(); + List weights8 =new ArrayList<>(); + for (Long weights : multimapWeightAndCent8.keys()) + { + weights8.add((float)weights); + } + + for (Long weight : multimapWeightAndCent8.keySet()) + + { + centroids8.addAll(multimapWeightAndCent8.get(weight)); + } + +// Agglomerative3 aggloOffline8 = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids8, so.getk()); +// aggloOffline8.setWeights(weights8); +// List finalcentroids_8 = aggloOffline8.getCentroids(); + + KMeans2 Offline8 = new KMeans2(); + Offline8.setK(so.getk()); + Offline8.setRawData(centroids8); + Offline8.setWeights(weights8); + List finalcentroids_8 = Offline8.getCentroids(); + + + Listcentroids9 = new ArrayList<>(); + List weights9 =new ArrayList<>(); + for (Long weights : multimapWeightAndCent9.keys()) + { + weights9.add((float)weights); + } + + for (Long weight : multimapWeightAndCent9.keySet()) + + { + centroids9.addAll(multimapWeightAndCent9.get(weight)); + } + +// Agglomerative3 aggloOffline9 = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids9, so.getk()); +// aggloOffline9.setWeights(weights9); +// List finalcentroids_9 = aggloOffline9.getCentroids(); + + KMeans2 Offline9 = new KMeans2(); + Offline9.setK(so.getk()); + Offline9.setRawData(centroids9); + Offline9.setWeights(weights9); + List finalcentroids_9 = Offline9.getCentroids(); + + + Listcentroids10 = new ArrayList<>(); + List weights10 =new ArrayList<>(); + for (Long weights : multimapWeightAndCent10.keys()) + { + weights10.add((float)weights); + } + + for (Long weight : multimapWeightAndCent10.keySet()) + + { + centroids10.addAll(multimapWeightAndCent10.get(weight)); + } + +// Agglomerative3 aggloOffline10 = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids10, so.getk()); +// aggloOffline10.setWeights(weights10); +// List finalcentroids_10 = aggloOffline10.getCentroids(); + + KMeans2 Offline10 = new KMeans2(); + Offline10.setK(so.getk()); + 
Offline10.setRawData(centroids10); + Offline10.setWeights(weights10); + List finalcentroids_10 = Offline10.getCentroids(); + + + VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/Desktop/temp/har/run_results/10runs/OutputTwrpCents_tree1"),finalcentroids_1, false); + + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_1, data)); + + VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/Desktop/temp/har/run_results/10runs/OutputTwrpCents_tree2"),finalcentroids_2, false); + + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_2, data)); + + VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/Desktop/temp/har/run_results/10runs/OutputTwrpCents_tree3"),finalcentroids_3, false); + + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_3, data)); + + VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/Desktop/temp/har/run_results/10runs/OutputTwrpCents_tree4"),finalcentroids_4, false); + + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_4, data)); + + VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/Desktop/temp/har/run_results/10runs/OutputTwrpCents_tree5"),finalcentroids_5, false); + + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_5, data)); + + VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/Desktop/temp/har/run_results/10runs/OutputTwrpCents_tree6"),finalcentroids_6, false); + + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_6, data)); + + VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/Desktop/temp/har/run_results/10runs/OutputTwrpCents_tree7"),finalcentroids_7, false); + + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_7, data)); + + VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/Desktop/temp/har/run_results/10runs/OutputTwrpCents_tree8"),finalcentroids_8, false); + + System.out.printf("%.0f\t", 
StatTests.WCSSECentroidsFloat(finalcentroids_8, data)); + + VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/Desktop/temp/har/run_results/10runs/OutputTwrpCents_tree9"),finalcentroids_9, false); + + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_9, data)); + + VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/Desktop/temp/har/run_results/10runs/OutputTwrpCents_tree10"),finalcentroids_10, false); + + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_10, data)); + + + + return multimapWeightAndCent; + +} + + public void run() { + rngvec = new float[so.getDimparameter()]; + rngvec2 = new float[so.getDimparameter()]; + rngvec3 = new float[so.getDimparameter()]; + rngvec4 = new float[so.getDimparameter()]; + rngvec5 = new float[so.getDimparameter()]; + + rngvec6 = new float[so.getDimparameter()]; + rngvec7 = new float[so.getDimparameter()]; + rngvec8 = new float[so.getDimparameter()]; + rngvec9 = new float[so.getDimparameter()]; + rngvec10 = new float[so.getDimparameter()]; + counter = 0; + boolean randVect = so.getRandomVector(); + + // Random r = new Random(so.getRandomSeed()); + // Random r = new Random(3800635955020675334L) ; + Random r = new Random(); + Random r2 = new Random(); + Random r3 = new Random(); + Random r4 = new Random(); + Random r5 = new Random(); + Random r6 = new Random(); + Random r7 = new Random(); + Random r8 = new Random(); + Random r9 = new Random(); + Random r10 = new Random(); + + if (randVect==true){ + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) r.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec2[i] = (float) r2.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec3[i] = (float) r3.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec4[i] = (float) r4.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec5[i] = (float) r5.nextGaussian(); + for (int i = 0; i < 
so.getDimparameter(); i++) + rngvec6[i] = (float) r6.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec7[i] = (float) r7.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec8[i] = (float) r8.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec9[i] = (float) r9.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec10[i] = (float) r10.nextGaussian(); + } + + else { + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) 0; + + } + + Multimap WeightAndClusters = findDensityModes2(); + + Listcentroids2 = new ArrayList<>(); + List weights2 =new ArrayList<>(); + + + System.out.println("\tNumberOfMicroClusters_AfterPruning = "+ WeightAndClusters.size()); + System.out.println("getRandomVector = "+ randVect); + + + for (Long weights : WeightAndClusters.keys()) + { + + weights2.add((float)weights); + + } + + + for (Long weight : WeightAndClusters.keySet()) + + { + + centroids2.addAll(WeightAndClusters.get(weight)); + + } + + +// Agglomerative3 aggloOffline = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids2, so.getk()); +// aggloOffline.setWeights(weights2); +// this.centroids = aggloOffline.getCentroids(); + + KMeans2 aggloOffline2 = new KMeans2(); + aggloOffline2.setK(so.getk()); + aggloOffline2.setRawData(centroids2); + aggloOffline2.setWeights(weights2); + this.centroids = aggloOffline2.getCentroids(); + + } + + + public static void main(String[] args) throws FileNotFoundException, + IOException { + + int k = 10;//6; + int d = 200;//16; + int n = 10000; + float var = 1.5f; + int count = 1; + // System.out.printf("ClusterVar\t"); + // for (int i = 0; i < count; i++) + // System.out.printf("Trial%d\t", i); + // System.out.printf("RealWCSS\n"); + + String Output = "/C:/Users/deysn/Desktop/temp/har/run_results/10runs/OutputTwrpCents_mainfunc_1" ; + + float f = var; + float avgrealwcss = 0; + float avgtime = 0; + // System.out.printf("%f\t", f); + // GenerateData gen = 
new GenerateData(k, n/k, d, f, true, .5f); + + // gen.writeCSVToFile(new File("/home/lee/Desktop/reclsh/in.csv")); + // List data = "/C:/Users/user/Desktop/temp/OutputTwrpCents1" + + // RPHashObject o = new SimpleArrayReader(gen.data, k); + + boolean raw = Boolean.parseBoolean(("raw")); + List data = null; + data = VectorUtil.readFile("/C:/Users/deysn/Desktop/temp/har/1D.txt", raw); + k = 6; + RPHashObject o = new SimpleArrayReader(data, 6); + + + o.setDimparameter(16); + o.setCutoff(60); + o.setRandomVector(true); + +// System.out.println("cutoff = "+ o.getCutoff()); +// System.out.println("get_random_Vector = "+ o.getRandomVector()); + + TWRPv6_wcss_offline2_TEST2_10runs rphit = new TWRPv6_wcss_offline2_TEST2_10runs(o); + long startTime = System.nanoTime(); + List centsr = rphit.getCentroids(); + + avgtime += (System.nanoTime() - startTime) / 100000000; + +// avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(),gen.getData()); + + + VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); + +// System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, data)); + + System.gc(); + +// System.out.printf("%.0f\n", avgrealwcss / count); + + + } + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + this.so.setK(getk); + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + return false; + } + + 
//@Override + public void setCutoff(int getCutoff) { + this.so.setCutoff(getCutoff); + } + + //@Override + public void setRandomVector(boolean getRandomVector) { + this.so.setRandomVector(getRandomVector); + } + + +} diff --git a/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_5runs.java b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_5runs.java new file mode 100644 index 0000000..f67fd7e --- /dev/null +++ b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_5runs.java @@ -0,0 +1,1062 @@ +package edu.uc.rphash; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +//import java.util.Arrays; +import java.util.HashMap; +//import java.util.Iterator; +//import java.util.LinkedHashMap; +import java.util.List; +//import java.util.Map; +import java.util.Map.Entry; +import java.util.Random; +import java.util.TreeSet; +import java.util.stream.Stream; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.clusterers.Agglomerative3; +import edu.uc.rphash.tests.clusterers.KMeans2; +import edu.uc.rphash.tests.clusterers.Agglomerative3.ClusteringType; +import edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.VectorUtil; + +//import org.apache.commons.collections.map.MultiValueMap; +//import org.apache.commons.collections.map.*; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Multimap; + + + +// this algorithm runs twrp 5 times : (only the random bisection vector varies, the Projection matrix remains same) +// and selects the one which has the best wcss offline for the 10X candidate centroids. 
+public class TWRPv6_wcss_offline2_TEST2_5runs implements Clusterer, Runnable { + + boolean znorm = false; + + private int counter; + private float[] rngvec; + private float[] rngvec2; + private float[] rngvec3; + private float[] rngvec4; + private float[] rngvec5; + + private List centroids = null; + + private RPHashObject so; + + public TWRPv6_wcss_offline2_TEST2_5runs(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + return getCentroids(); + } + + @Override + public List getCentroids() { + if (centroids == null) + run(); + return centroids; + } + + +// This function returns the square of the euclidean distance. + public static float distancesq(float[] x, float[] y) { + if (x.length < 1) + return Float.MAX_VALUE; + if (y.length < 1) + return Float.MAX_VALUE; + float dist = (x[0] - y[0]) * (x[0] - y[0]); + for (int i = 1; i < x.length; i++) + dist += ((x[i] - y[i]) * (x[i] - y[i])); +// return (float) Math.sqrt(dist); + return dist; + } + + +// This method finds the smallest of the numbers and returns that index. 
+ + public static int smallest(float[] arr) + { + // Initialize minimum element + float min = arr[0]; + int minindex = 0; + // System.out.println(" LENGHT : " + arr.length); + // Traverse array elements from second and + // compare every element with current max + for (int i = 1; i < (arr.length); i++) { + // System.out.println("the min value of i : " + i); + if (arr[i]< min) { + min = arr[i]; + minindex = i; + } + } + // System.out.println("the min value is : " + min); + // System.out.println("the index for min val is : " + minindex); + return minindex; + } + + + /* + * X - set of vectors compute the medoid of a vector set + */ + float[] medoid(List X) { + float[] ret = X.get(0); + for (int i = 1; i < X.size(); i++) { + for (int j = 0; j < ret.length; j++) { + ret[j] += X.get(i)[j]; + } + } + for (int j = 0; j < ret.length; j++) { + ret[j] = ret[j] / ((float) X.size()); + } + return ret; + } + +// this updates the map two cents with different weights are merged into one. + public static float[][] UpdateHashMap(float cnt_1, float[] x_1, float wcss_1, + float cnt_2, float[] x_2 , float wcss_2) { + + float cnt_r = cnt_1 + cnt_2; + + float[] x_r = new float[x_1.length]; + + for (int i = 0; i < x_1.length; i++) { + x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; + + } + + float wcss = ( ((wcss_1 + distancesq(x_r,x_1)) ) + distancesq(x_r,x_2) ); + + float[][] ret = new float[3][]; + ret[0] = new float[1]; + ret[0][0] = cnt_r; + ret[1] = x_r; + ret[2]= new float [1]; + ret[2][0]= wcss; + return ret; + + } + +// this method is used to calculate the offline wcss +// UpdateHashMap_offlineWcss( CurrentCent, currentWcss, IncomingVector, incomingWcss ); + + public static float[][] UpdateHashMap_offlineWcss(float[] x_1, float wcss_1,float[] x_2 ) { + + float wcss = wcss_1 + distancesq(x_1,x_2); + + + float[][] ret = new float[3][]; + ret[0] = new float[1]; + ret[2]= new float [1]; + ret[2][0]= wcss; + return ret; + + } + + public long hashvec2( float[] xt, float[] x, + 
HashMap MapOfIDAndCent, HashMap MapOfIDAndCount, int ct, float[] rngvec, HashMap MapOfIDAndWCSS) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { +// s <<= 1; + s = s << 1 ; // left shift the bits of s by 1. + if (xt[i] > rngvec[i]) + s= s+1; + + if (MapOfIDAndCent.containsKey(s)) { + + float CurrentCount = MapOfIDAndCount.get(s); + float CurrentCent [] = MapOfIDAndCent.get(s); + float CountForIncomingVector = 1; + float IncomingVector [] = x; + float currentWcss= MapOfIDAndWCSS.get(s); + float incomingWcss= 0; + + float[][] MergedValues = UpdateHashMap(CurrentCount , CurrentCent, currentWcss, CountForIncomingVector, IncomingVector, incomingWcss ); + + Long UpdatedCount = (long) MergedValues[0][0] ; + + float[] MergedVector = MergedValues[1] ; + + float wcss= MergedValues[2][0]; + + MapOfIDAndCount.put(s , UpdatedCount); + + MapOfIDAndCent.put(s, MergedVector); + + MapOfIDAndWCSS.put(s, wcss); + + } + + else { + + float[] xlist = x; + MapOfIDAndCent.put(s, xlist); + MapOfIDAndCount.put(s, (long)1); + MapOfIDAndWCSS.put(s, (float)0); + } + } + return s; + } + +// this hash is to calculate the wcss +// hashvec2_forwcss(xt,x,IDAndCent,rngvec ,IDandWCSS); + + public long hashvec2_forwcss( float[] xt, float[] x, HashMap MapOfIDAndCent, float[] rngvec, HashMapIDandWCSS_offline) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { + s = s << 1 ; // left shift the bits of s by 1. 
+ if (xt[i] > rngvec[i]) + s= s+1; + + if (MapOfIDAndCent.containsKey(s)) { + + float CurrentCent [] = MapOfIDAndCent.get(s); + float IncomingVector [] = x; + + + float currentWcss= 0; + + if (IDandWCSS_offline.containsKey(s)) { + currentWcss= IDandWCSS_offline.get(s); + } + + float[][] MergedValues = UpdateHashMap_offlineWcss( CurrentCent, currentWcss, IncomingVector ); + + float wcss= MergedValues[2][0]; + + IDandWCSS_offline.put(s, wcss); + + } + } + return s; + } + + /* + * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid + * vector map + * + * hash the projected vector x and update the hash to centroid and counts + * maps + */ + void addtocounter(float[] x, Projector p, + HashMap IDAndCent,HashMap IDandID,int ct, float[] rngvec , HashMap IDandWCSS) { + float[] xt = p.project(x); + + hashvec2(xt,x,IDAndCent, IDandID, ct,rngvec , IDandWCSS); + } + + // this method is used to compute the offline WCSS to choose the best of the clusters + //calcWCSSoffline(x, projector, MapOfIDAndCent1, rngvec, MapOfIDAandWCSS1_offline); + + void calcWCSSoffline(float[] x, Projector p, HashMap MapOfIDAndCent, float[] rngvec , HashMap MapOfIDAandWCSS_offline) { + + float[] xt = p.project(x); + + hashvec2_forwcss(xt,x,MapOfIDAndCent,rngvec ,MapOfIDAandWCSS_offline); + + } + + static boolean isPowerOfTwo(long num) { + return (num & -num) == num; + } + + /* + * X - data set k - canonical k in k-means l - clustering sub-space Compute + * density mode via iterative deepening hash counting + */ + + public Multimap findDensityModes2() { + + HashMap MapOfIDAndCent1 = new HashMap<>(); + HashMap MapOfIDAndCount1 = new HashMap<>(); + HashMap MapOfIDAndWCSS1 = new HashMap<>(); + + HashMap MapOfIDAndCent2 = new HashMap<>(); + HashMap MapOfIDAndCount2 = new HashMap<>(); + HashMap MapOfIDAndWCSS2 = new HashMap<>(); + + HashMap MapOfIDAndCent3 = new HashMap<>(); + HashMap MapOfIDAndCount3 = new HashMap<>(); + HashMap MapOfIDAndWCSS3 = new HashMap<>(); + + HashMap MapOfIDAndCent4 
= new HashMap<>(); + HashMap MapOfIDAndCount4 = new HashMap<>(); + HashMap MapOfIDAndWCSS4 = new HashMap<>(); + + HashMap MapOfIDAndCent5 = new HashMap<>(); + HashMap MapOfIDAndCount5 = new HashMap<>(); + HashMap MapOfIDAndWCSS5 = new HashMap<>(); + + + + + // #create projector matrixs + Projector projector = so.getProjectionType(); + projector.setOrigDim(so.getdim()); + projector.setProjectedDim(so.getDimparameter()); + projector.setRandomSeed(so.getRandomSeed()); +// projector.setRandomSeed(535247432); + + projector.init(); + int cutoff = so.getCutoff(); + + int ct = 0; + + { + + for (float[] x : so.getRawData()) + { + addtocounter(x, projector, MapOfIDAndCent1, MapOfIDAndCount1,ct++, rngvec, MapOfIDAndWCSS1); + addtocounter(x, projector, MapOfIDAndCent2, MapOfIDAndCount2,ct++, rngvec2,MapOfIDAndWCSS2); + addtocounter(x, projector, MapOfIDAndCent3, MapOfIDAndCount3,ct++, rngvec3,MapOfIDAndWCSS3); + addtocounter(x, projector, MapOfIDAndCent4, MapOfIDAndCount4,ct++, rngvec4,MapOfIDAndWCSS4); + addtocounter(x, projector, MapOfIDAndCent5, MapOfIDAndCount5,ct++, rngvec5,MapOfIDAndWCSS5); + + } + } + + System.out.println("NumberOfMicroClustersBeforePruning = "+ MapOfIDAndCent3.size()); + + // next we want to prune the tree by parent count comparison + // follows breadthfirst search + + HashMap denseSetOfIDandCount2_1 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount1.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount1.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount1.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_1.put(parent_id, 0L); + + MapOfIDAndCent1.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_1.remove(parent_id); + + MapOfIDAndCent1.put(parent_id, new float[]{}); 
+ + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_2 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount2.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount2.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount2.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_2.put(parent_id, 0L); + + MapOfIDAndCent2.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_2.remove(parent_id); + + MapOfIDAndCent2.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_3 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount3.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount3.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount3.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_3.put(parent_id, 0L); + + MapOfIDAndCent3.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_3.remove(parent_id); + + MapOfIDAndCent3.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_4 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount4.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount4.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount4.get(parent_id).longValue()); + + if(cur_count!=0 && 
parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_4.put(parent_id, 0L); + + MapOfIDAndCent4.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_4.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_4.remove(parent_id); + + MapOfIDAndCent4.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_4.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_5 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount5.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount5.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount5.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_5.put(parent_id, 0L); + + MapOfIDAndCent5.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_5.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_5.remove(parent_id); + + MapOfIDAndCent5.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_5.put(cur_id, (long) cur_count); + } + } + } + } + } + + + + //remove keys with support less than 1 + Stream> stream2_1 = denseSetOfIDandCount2_1.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_1= new ArrayList<>(); + // sort and limit the list + stream2_1.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_1.add(x.getKey())); + + + + Stream> stream2_2 = denseSetOfIDandCount2_2.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_2= new ArrayList<>(); + // sort and limit the list + stream2_2.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_2.add(x.getKey())); + + + Stream> stream2_3 = denseSetOfIDandCount2_3.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_3= new ArrayList<>(); + // sort and limit the list + stream2_3.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_3.add(x.getKey())); + + + Stream> stream2_4 = denseSetOfIDandCount2_4.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_4= new ArrayList<>(); + // sort and limit the list + stream2_4.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_4.add(x.getKey())); + + Stream> stream2_5 = denseSetOfIDandCount2_5.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_5= new ArrayList<>(); + // sort and limit the list + stream2_5.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_5.add(x.getKey())); + + + float WCSS1 = 0; + float WCSS2 = 0; + float WCSS3 = 0; + float WCSS4 = 0; + float WCSS5 = 0; + + float WCSS_off_1 = 0; + float WCSS_off_2 = 0; + float WCSS_off_3 = 0; + float WCSS_off_4 = 0; + float WCSS_off_5 = 0; + + HashMap denseSetOfIDandCount2 = new HashMap(); + HashMap MapOfIDAndCent = new HashMap<>(); + HashMap MapOfIDAndCount = new HashMap<>(); + HashMap MapOfIDAndWCSS = new HashMap<>(); + + HashMap MapOfIDAandWCSS_offline_1 = new HashMap<>(); + HashMap MapOfIDAandWCSS_offline_2 = new HashMap<>(); + HashMap MapOfIDAandWCSS_offline_3 = new HashMap<>(); + HashMap MapOfIDAandWCSS_offline_4 = new HashMap<>(); + HashMap MapOfIDAandWCSS_offline_5 = new HashMap<>(); + + // calculate the real wcss in offline fashion, so for the keys , hash the points into those buckets + // and calculate the wcss as we know their centroids : + + + for (float[] x : so.getRawData()) + { + + calcWCSSoffline(x, projector, MapOfIDAndCent1, rngvec , MapOfIDAandWCSS_offline_1); + calcWCSSoffline(x, 
projector, MapOfIDAndCent2, rngvec2, MapOfIDAandWCSS_offline_2); + calcWCSSoffline(x, projector, MapOfIDAndCent3, rngvec3, MapOfIDAandWCSS_offline_3); + calcWCSSoffline(x, projector, MapOfIDAndCent4, rngvec4, MapOfIDAandWCSS_offline_4); + calcWCSSoffline(x, projector, MapOfIDAndCent5, rngvec5, MapOfIDAandWCSS_offline_5); + } + + + //* THIS IS THE RUNTIME CALCULATION OF WCSS STATISTICS WHICH IS DONE ONLINE: + + for (Long keys: sortedIDList2_1){ + WCSS1 = WCSS1 + MapOfIDAndWCSS1.get(keys);} + + for (Long keys: sortedIDList2_2) + { WCSS2 = WCSS2 + MapOfIDAndWCSS2.get(keys);} + + for (Long keys: sortedIDList2_3) + { WCSS3 = WCSS3 + MapOfIDAndWCSS3.get(keys);} + + for (Long keys: sortedIDList2_4) + { WCSS4 = WCSS4 + MapOfIDAndWCSS4.get(keys);} + + for (Long keys: sortedIDList2_5) + { WCSS5 = WCSS5 + MapOfIDAndWCSS5.get(keys);} + +//* THIS IS THE RUNTIME CALCULATION OF WCSS STATISTICS WHICH REQUIRES ANOTHER PASS OVER THE DATA: + + for (Long keys: sortedIDList2_1) + { WCSS_off_1 = WCSS_off_1 + MapOfIDAandWCSS_offline_1.get(keys);} + + for (Long keys: sortedIDList2_2) + { WCSS_off_2 = WCSS_off_2 + MapOfIDAandWCSS_offline_2.get(keys);} + + for (Long keys: sortedIDList2_3) + { WCSS_off_3 = WCSS_off_3 + MapOfIDAandWCSS_offline_3.get(keys);} + + for (Long keys: sortedIDList2_4) + { WCSS_off_4 = WCSS_off_4 + MapOfIDAandWCSS_offline_4.get(keys);} + + for (Long keys: sortedIDList2_5) + { WCSS_off_5 = WCSS_off_5 + MapOfIDAandWCSS_offline_5.get(keys);} + + System.out.print("wcss1 = " + WCSS1); + System.out.println(" wcss_ofline_1 = " + WCSS_off_1); + + System.out.print("wcss2 = " + WCSS2); + System.out.println(" wcss_ofline_2 = " + WCSS_off_2); + + System.out.print("wcss3 = " + WCSS3); + System.out.println(" wcss_ofline_3 = " + WCSS_off_3); + + System.out.print("wcss4 = " + WCSS4); + System.out.println(" wcss_ofline_4 = " + WCSS_off_4); + + System.out.print("wcss5 = " + WCSS5); + System.out.println(" wcss_ofline_5 = " + WCSS_off_5); + + + float arr[] = {WCSS_off_1, WCSS_off_2, 
WCSS_off_3, WCSS_off_4, WCSS_off_5}; + int index_of_max = smallest(arr); + + if (index_of_max == 0){ + MapOfIDAndCount = MapOfIDAndCount1; + MapOfIDAndCent = MapOfIDAndCent1; + MapOfIDAndWCSS = MapOfIDAndWCSS1; + denseSetOfIDandCount2 = denseSetOfIDandCount2_1; + System.out.println("winner = tree1"); + } + if (index_of_max == 1){ + MapOfIDAndCount = MapOfIDAndCount2; + MapOfIDAndCent = MapOfIDAndCent2; + MapOfIDAndWCSS = MapOfIDAndWCSS2; + denseSetOfIDandCount2 = denseSetOfIDandCount2_2; + System.out.println("winner = tree2"); + } + if (index_of_max == 2){ + MapOfIDAndCount = MapOfIDAndCount3; + MapOfIDAndCent = MapOfIDAndCent3; + MapOfIDAndWCSS = MapOfIDAndWCSS3; + denseSetOfIDandCount2 = denseSetOfIDandCount2_3; + System.out.println("winner = tree3"); + } + if (index_of_max == 3) { + MapOfIDAndCount = MapOfIDAndCount4; + MapOfIDAndCent = MapOfIDAndCent4; + MapOfIDAndWCSS = MapOfIDAndWCSS4; + denseSetOfIDandCount2 = denseSetOfIDandCount2_4; + System.out.println("winner = tree4"); + } + if (index_of_max == 4) { + MapOfIDAndCount = MapOfIDAndCount5; + MapOfIDAndCent = MapOfIDAndCent5; + MapOfIDAndWCSS = MapOfIDAndWCSS5; + denseSetOfIDandCount2 = denseSetOfIDandCount2_5; + System.out.println("winner = tree5"); + } + + + System.out.println("NumberOfMicroClustersAfterPruning&beforesortingLimit = "+ denseSetOfIDandCount2.size()); + + //remove keys with support less than 1 + Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2= new ArrayList<>(); + // sort and limit the list + stream2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2.add(x.getKey())); + + Multimap multimapWeightAndCent = ArrayListMultimap.create(); + + for (Long keys: sortedIDList2) + + { + + multimapWeightAndCent.put((Long)(MapOfIDAndCount.get(keys)), (float[]) (MapOfIDAndCent.get(keys))); + + } + + + // this is to be taken out . only done for hypothesis testing. 
+ + boolean raw = Boolean.parseBoolean(("raw")); + List data = null; + try { + data = VectorUtil.readFile("/C:/Users/deysn/Desktop/temp/har/1D.txt", raw); + } catch (FileNotFoundException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + Multimap multimapWeightAndCent1 = ArrayListMultimap.create(); + for (Long keys: sortedIDList2_1) + { + multimapWeightAndCent1.put((Long)(MapOfIDAndCount1.get(keys)), (float[]) (MapOfIDAndCent1.get(keys))); + } + + Multimap multimapWeightAndCent2 = ArrayListMultimap.create(); + for (Long keys: sortedIDList2_2) + { + multimapWeightAndCent2.put((Long)(MapOfIDAndCount2.get(keys)), (float[]) (MapOfIDAndCent2.get(keys))); + } + + Multimap multimapWeightAndCent3 = ArrayListMultimap.create(); + for (Long keys: sortedIDList2_3) + { + multimapWeightAndCent3.put((Long)(MapOfIDAndCount3.get(keys)), (float[]) (MapOfIDAndCent3.get(keys))); + } + + Multimap multimapWeightAndCent4 = ArrayListMultimap.create(); + for (Long keys: sortedIDList2_4) + { + multimapWeightAndCent4.put((Long)(MapOfIDAndCount4.get(keys)), (float[]) (MapOfIDAndCent4.get(keys))); + } + + Multimap multimapWeightAndCent5 = ArrayListMultimap.create(); + for (Long keys: sortedIDList2_5) + { + multimapWeightAndCent5.put((Long)(MapOfIDAndCount5.get(keys)), (float[]) (MapOfIDAndCent5.get(keys))); + } + + + Listcentroids1 = new ArrayList<>(); + List weights1 =new ArrayList<>(); + for (Long weights : multimapWeightAndCent1.keys()) + { + weights1.add((float)weights); + } + + for (Long weight : multimapWeightAndCent1.keySet()) + + { + centroids1.addAll(multimapWeightAndCent1.get(weight)); + } + +// Agglomerative3 aggloOffline = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids1, so.getk()); +// aggloOffline.setWeights(weights1); +// List finalcentroids_1 = aggloOffline.getCentroids(); + + KMeans2 Offline = new KMeans2(); + Offline.setK(so.getk()); + 
Offline.setRawData(centroids1); + Offline.setWeights(weights1); + List finalcentroids_1 = Offline.getCentroids(); + + + + Listcentroids2 = new ArrayList<>(); + List weights2 =new ArrayList<>(); + for (Long weights : multimapWeightAndCent2.keys()) + { + weights2.add((float)weights); + } + + for (Long weight : multimapWeightAndCent2.keySet()) + + { + centroids2.addAll(multimapWeightAndCent1.get(weight)); + } + +// Agglomerative3 aggloOffline2 = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids2, so.getk()); +// aggloOffline2.setWeights(weights2); +// List finalcentroids_2 = aggloOffline2.getCentroids(); + + KMeans2 Offline2 = new KMeans2(); + Offline2.setK(so.getk()); + Offline2.setRawData(centroids2); + Offline2.setWeights(weights2); + List finalcentroids_2 = Offline2.getCentroids(); + + + Listcentroids3 = new ArrayList<>(); + List weights3 =new ArrayList<>(); + for (Long weights : multimapWeightAndCent3.keys()) + { + weights3.add((float)weights); + } + + for (Long weight : multimapWeightAndCent3.keySet()) + + { + centroids3.addAll(multimapWeightAndCent3.get(weight)); + } + +// Agglomerative3 aggloOffline3 = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids3, so.getk()); +// aggloOffline3.setWeights(weights3); +// List finalcentroids_3 = aggloOffline3.getCentroids(); + + KMeans2 Offline3 = new KMeans2(); + Offline3.setK(so.getk()); + Offline3.setRawData(centroids3); + Offline3.setWeights(weights3); + List finalcentroids_3 = Offline3.getCentroids(); + + Listcentroids4 = new ArrayList<>(); + List weights4 =new ArrayList<>(); + for (Long weights : multimapWeightAndCent4.keys()) + { + weights4.add((float)weights); + } + + for (Long weight : multimapWeightAndCent4.keySet()) + + { + centroids4.addAll(multimapWeightAndCent4.get(weight)); + } + +// Agglomerative3 aggloOffline4 = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids4, so.getk()); +// aggloOffline4.setWeights(weights4); +// List finalcentroids_4 = aggloOffline4.getCentroids(); + + KMeans2 
Offline4 = new KMeans2(); + Offline4.setK(so.getk()); + Offline4.setRawData(centroids4); + Offline4.setWeights(weights4); + List finalcentroids_4 = Offline4.getCentroids(); + + + Listcentroids5 = new ArrayList<>(); + List weights5 =new ArrayList<>(); + for (Long weights : multimapWeightAndCent5.keys()) + { + weights5.add((float)weights); + } + + for (Long weight : multimapWeightAndCent5.keySet()) + + { + centroids5.addAll(multimapWeightAndCent5.get(weight)); + } + +// Agglomerative3 aggloOffline5 = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids5, so.getk()); +// aggloOffline5.setWeights(weights5); +// List finalcentroids_5 = aggloOffline5.getCentroids(); + + KMeans2 Offline5 = new KMeans2(); + Offline5.setK(so.getk()); + Offline5.setRawData(centroids5); + Offline5.setWeights(weights5); + List finalcentroids_5 = Offline5.getCentroids(); + + + + VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/Desktop/temp/har/run_results/5runs/OutputTwrpCents_tree1"),finalcentroids_1, false); + + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_1, data)); + + VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/Desktop/temp/har/run_results/5runs/OutputTwrpCents_tree2"),finalcentroids_2, false); + + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_2, data)); + + VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/Desktop/temp/har/run_results/5runs/OutputTwrpCents_tree3"),finalcentroids_3, false); + + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_3, data)); + + VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/Desktop/temp/har/run_results/5runs/OutputTwrpCents_tree4"),finalcentroids_4, false); + + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_4, data)); + + VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/Desktop/temp/har/run_results/5runs/OutputTwrpCents_tree5"),finalcentroids_5, false); + + System.out.printf("%.0f\t", 
StatTests.WCSSECentroidsFloat(finalcentroids_5, data)); + + return multimapWeightAndCent; + +} + + public void run() { + rngvec = new float[so.getDimparameter()]; + rngvec2 = new float[so.getDimparameter()]; + rngvec3 = new float[so.getDimparameter()]; + rngvec4 = new float[so.getDimparameter()]; + rngvec5 = new float[so.getDimparameter()]; + + counter = 0; + boolean randVect = so.getRandomVector(); + + // Random r = new Random(so.getRandomSeed()); + // Random r = new Random(3800635955020675334L) ; + Random r = new Random(); + Random r2 = new Random(); + Random r3 = new Random(); + Random r4 = new Random(); + Random r5 = new Random(); + + if (randVect==true){ + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) r.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec2[i] = (float) r2.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec3[i] = (float) r3.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec4[i] = (float) r4.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec5[i] = (float) r5.nextGaussian(); + } + + else { + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) 0; + + } + + Multimap WeightAndClusters = findDensityModes2(); + + Listcentroids2 = new ArrayList<>(); + List weights2 =new ArrayList<>(); + + + System.out.println("\tNumberOfMicroClusters_AfterPruning = "+ WeightAndClusters.size()); + System.out.println("getRandomVector = "+ randVect); + + + for (Long weights : WeightAndClusters.keys()) + { + + weights2.add((float)weights); + + } + + + for (Long weight : WeightAndClusters.keySet()) + + { + + centroids2.addAll(WeightAndClusters.get(weight)); + + } + + +// Agglomerative3 aggloOffline = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids2, so.getk()); +// aggloOffline.setWeights(weights2); +// this.centroids = aggloOffline.getCentroids(); + + KMeans2 aggloOffline2 = new KMeans2(); + aggloOffline2.setK(so.getk()); + 
aggloOffline2.setRawData(centroids2); + aggloOffline2.setWeights(weights2); + this.centroids = aggloOffline2.getCentroids(); + + } + + + public static void main(String[] args) throws FileNotFoundException, + IOException { + + int k = 10;//6; + int d = 200;//16; + int n = 10000; + float var = 1.5f; + int count = 1; + // System.out.printf("ClusterVar\t"); + // for (int i = 0; i < count; i++) + // System.out.printf("Trial%d\t", i); + // System.out.printf("RealWCSS\n"); + + String Output = "/C:/Users/deysn/Desktop/temp/har/run_results/5runs/OutputTwrpCents_mainfunc_1" ; + + float f = var; + float avgrealwcss = 0; + float avgtime = 0; + // System.out.printf("%f\t", f); + // GenerateData gen = new GenerateData(k, n/k, d, f, true, .5f); + + // gen.writeCSVToFile(new File("/home/lee/Desktop/reclsh/in.csv")); + // List data = "/C:/Users/user/Desktop/temp/OutputTwrpCents1" + + // RPHashObject o = new SimpleArrayReader(gen.data, k); + + boolean raw = Boolean.parseBoolean(("raw")); + List data = null; + data = VectorUtil.readFile("/C:/Users/deysn/Desktop/temp/har/1D.txt", raw); + k = 12; + RPHashObject o = new SimpleArrayReader(data, k); + + + o.setDimparameter(16); + o.setCutoff(60); + o.setRandomVector(true); + +// System.out.println("cutoff = "+ o.getCutoff()); +// System.out.println("get_random_Vector = "+ o.getRandomVector()); + + TWRPv6_wcss_offline2_TEST2_5runs rphit = new TWRPv6_wcss_offline2_TEST2_5runs(o); + long startTime = System.nanoTime(); + List centsr = rphit.getCentroids(); + + avgtime += (System.nanoTime() - startTime) / 100000000; + +// avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(),gen.getData()); + + + VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); + +// System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, data)); + + System.gc(); + +// System.out.printf("%.0f\n", avgrealwcss / count); + + + } + + @Override + public RPHashObject 
getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + this.so.setK(getk); + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + return false; + } + + //@Override + public void setCutoff(int getCutoff) { + this.so.setCutoff(getCutoff); + } + + //@Override + public void setRandomVector(boolean getRandomVector) { + this.so.setRandomVector(getRandomVector); + } + + +} diff --git a/src/main/java/edu/uc/rphash/tests/clusterers/DBScan.java b/src/main/java/edu/uc/rphash/tests/clusterers/DBScan.java index 085bb30..0012969 100644 --- a/src/main/java/edu/uc/rphash/tests/clusterers/DBScan.java +++ b/src/main/java/edu/uc/rphash/tests/clusterers/DBScan.java @@ -55,8 +55,8 @@ public DBScan(List data ) { public List getCentroids() { // to be completed - double eps = 0.35; - int minPoints = 5; + double eps = 2; + int minPoints = 3; DBSCANClusterer db = new DBSCANClusterer(eps , minPoints ); @@ -85,7 +85,10 @@ public List getCentroids() { // to be completed } C.add(new Centroid(floatArray, 0)); // setting the projection id = 0 } + return C; + + } // abstract RPHashObject getParam(); @@ -162,13 +165,17 @@ public boolean setMultiRun(int runs) { public static void main(String[] args) { - GenerateData gen = new GenerateData(3, 1000, 5); // the data generator of rhpash + GenerateData gen = new GenerateData(20,500,5); // the data generator of rhpash + DBScan db = new DBScan (gen.data ); + System.out.println("number of centroids = "+ 
(db.getCentroids()).size()); for (Centroid iter : db.getCentroids()) { // output centroids float[] toprint = iter.centroid(); + System.out.println("333333333333333"); + System.out.println(Arrays.toString(toprint)); } From 97c2ca85de590bccbf8ec28b075666f92da60cb6 Mon Sep 17 00:00:00 2001 From: deysn Date: Fri, 19 Mar 2021 04:41:03 -0400 Subject: [PATCH 11/29] updating the code with latest runtime parameters. --- .../uc/rphash/TWRPv6_wcss_offline2_TEST.java | 368 +++++++++++------- .../uc/rphash/tests/clusterers/DBScan.java | 18 +- .../rphash/tests/generators/GenerateData.java | 13 +- 3 files changed, 241 insertions(+), 158 deletions(-) diff --git a/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST.java b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST.java index a728a44..e591486 100644 --- a/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST.java +++ b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST.java @@ -4,6 +4,7 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.util.ArrayList; +import java.util.Comparator; //import java.util.Arrays; import java.util.HashMap; //import java.util.Iterator; @@ -14,6 +15,7 @@ import java.util.Random; import java.util.TreeSet; import java.util.stream.Stream; +import java.util.Collections; import edu.uc.rphash.Readers.RPHashObject; import edu.uc.rphash.Readers.SimpleArrayReader; @@ -25,6 +27,8 @@ import edu.uc.rphash.tests.generators.GenerateData; import edu.uc.rphash.util.VectorUtil; import edu.uc.rphash.tests.clusterers.DBScan; +import edu.uc.rphash.tests.clusterers.MultiKMPP; + //import org.apache.commons.collections.map.MultiValueMap; //import org.apache.commons.collections.map.*; @@ -43,6 +47,7 @@ public class TWRPv6_wcss_offline2_TEST implements Clusterer, Runnable { private float[] rngvec; private float[] rngvec2; private float[] rngvec3; + private float eps; private List centroids = null; @@ -78,7 +83,6 @@ public static float distancesq(float[] x, float[] y) { return dist; } - 
/* /* * X - set of vectors compute the medoid of a vector set @@ -128,10 +132,10 @@ public static float[][] UpdateHashMap(float cnt_1, float[] x_1, float wcss_1, } -// this method is used to calculate the offline wcss +// this method is used to calculate the offline wcss // UpdateHashMap_offlineWcss( CurrentCent, currentWcss, IncomingVector, incomingWcss ); - public static float[][] UpdateHashMap_offlineWcss(float[] x_1, float wcss_1,float[] x_2 ) { +/* public static float[][] UpdateHashMap_offlineWcss(float[] x_1, float wcss_1,float[] x_2 ) { float wcss = wcss_1 + distancesq(x_1,x_2); @@ -146,7 +150,7 @@ public static float[][] UpdateHashMap_offlineWcss(float[] x_1, float wcss_1,floa return ret; } - +*/ public long hashvec2( float[] xt, float[] x, HashMap MapOfIDAndCent, HashMap MapOfIDAndCount, int ct, float[] rngvec, HashMap MapOfIDAndWCSS) { long s = 1; //fixes leading 0's bug @@ -195,7 +199,7 @@ public long hashvec2( float[] xt, float[] x, // this hash is to calculate the wcss // hashvec2_forwcss(xt,x,IDAndCent,rngvec ,IDandWCSS); - public long hashvec2_forwcss( float[] xt, float[] x, HashMap MapOfIDAndCent, float[] rngvec, HashMapIDandWCSS_offline) { +/* public long hashvec2_forwcss( float[] xt, float[] x, HashMap MapOfIDAndCent, float[] rngvec, HashMapIDandWCSS_offline) { long s = 1; //fixes leading 0's bug for (int i = 0; i < xt.length; i++) { s = s << 1 ; // left shift the bits of s by 1. 
@@ -226,6 +230,7 @@ public long hashvec2_forwcss( float[] xt, float[] x, HashMap Map } return s; } +*/ /* * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid @@ -244,14 +249,14 @@ void addtocounter(float[] x, Projector p, // this method is used to compute the offline WCSS to choose the best of the clusters //calcWCSSoffline(x, projector, MapOfIDAndCent1, rngvec, MapOfIDAandWCSS1_offline); - void calcWCSSoffline(float[] x, Projector p, HashMap MapOfIDAndCent, float[] rngvec , HashMap MapOfIDAandWCSS_offline) { +/* void calcWCSSoffline(float[] x, Projector p, HashMap MapOfIDAndCent, float[] rngvec , HashMap MapOfIDAandWCSS_offline) { float[] xt = p.project(x); hashvec2_forwcss(xt,x,MapOfIDAndCent,rngvec ,MapOfIDAandWCSS_offline); } - +*/ static boolean isPowerOfTwo(long num) { return (num & -num) == num; } @@ -259,7 +264,7 @@ static boolean isPowerOfTwo(long num) { public void printHashmap(HashMap hashmap) { - //System.out.println(hashmap.keySet()); + System.out.println(hashmap.keySet()); System.out.println(hashmap.values()); } @@ -269,8 +274,50 @@ public void printStream(Stream> stream) { System.out.println(stream.count()); } - +// this method calculates the epsilon value and prints the information. 
+public float printInfo(ListsetofKeys, HashMap MapOfIDAndCount, HashMap MapOfIDAndCent, HashMap MapOfIDAndWCSS) { + List counts = new ArrayList<>(); + List wcsseprint = new ArrayList<>(); + float temp = 0; + int elements=0; + float avg=0; + + for (Long keys: setofKeys) + { + elements=elements+1; +//// System.out.println(MapOfIDAndCount.get(keys)); + counts.add(MapOfIDAndCount.get(keys)); + wcsseprint.add(MapOfIDAndWCSS.get(keys)); + + } +// System.out.println(); + System.out.print(counts); + +// for (Long keys: setofKeys) +// { +// System.out.println(MapOfIDAndWCSS.get(keys)); +// wcsseprint.add(MapOfIDAndWCSS.get(keys)); +// } + + // calculation of epsilon + /* + for (int i=0 ; i<(0.8*elements); i++) //for (int i=0 ; i<(0.8*elements); i++) + { + temp = temp + (wcsseprint.get(i))/(counts.get(i)); + } + avg = (float) (temp/(0.8*elements)); + System.out.println(); + System.out.println("\taverage epsilon = "+ avg); + */ + Collections.sort(wcsseprint); + Collections.reverse(wcsseprint); + System.out.println(); + System.out.println(wcsseprint); + System.out.println(); + + return (avg); + } /* * X - data set k - canonical k in k-means l - clustering sub-space Compute @@ -296,8 +343,9 @@ public Multimap findDensityModes2() { Projector projector = so.getProjectionType(); projector.setOrigDim(so.getdim()); projector.setProjectedDim(so.getDimparameter()); + projector.setRandomSeed(so.getRandomSeed()); -// projector.setRandomSeed(535247432); + //projector.setRandomSeed(949124732); projector.init(); int cutoff = so.getCutoff(); @@ -315,7 +363,8 @@ public Multimap findDensityModes2() { } } - System.out.println("NumberOfMicroClustersBeforePruning = "+ MapOfIDAndCent1.size()); + System.out.println("\nNumberOfMicroClustersBeforePruning = "+ MapOfIDAndCent1.size()); + //printHashmap(MapOfIDAndCount1); // next we want to prune the tree by parent count comparison // follows breadthfirst search @@ -356,11 +405,11 @@ public Multimap findDensityModes2() { } } } - HashMap 
denseSetOfIDandCount2_2 = new HashMap(); for (Long cur_id : new TreeSet(MapOfIDAndCount2.keySet())) { + if (cur_id >so.getk()){ int cur_count = (int) (MapOfIDAndCount2.get(cur_id).longValue()); long parent_id = cur_id>>>1; @@ -441,9 +490,7 @@ public Multimap findDensityModes2() { // sort and limit the list stream2_1.sorted(Entry. comparingByValue().reversed()).limit(cutoff) .forEachOrdered(x -> sortedIDList2_1.add(x.getKey())); - System.out.println("88888888888888888888888888888888888888888888888888888888888888888888888888888"); - printHashmap(denseSetOfIDandCount2_1); - + // printHashmap(denseSetOfIDandCount2_1); Stream> stream2_2 = denseSetOfIDandCount2_2.entrySet().stream().filter(p -> p.getValue() > 1); @@ -465,36 +512,35 @@ public Multimap findDensityModes2() { float WCSS1 = 0; float WCSS2 = 0; float WCSS3 = 0; - - float WCSS_off_1 = 0; - float WCSS_off_2 = 0; - float WCSS_off_3 = 0; - - + HashMap denseSetOfIDandCount2 = new HashMap(); - - + HashMap MapOfIDAndCent = new HashMap<>(); HashMap MapOfIDAndCount = new HashMap<>(); - HashMap MapOfIDAndWCSS = new HashMap<>(); + HashMap MapOfIDAndWCSS = new HashMap<>(); + - HashMap MapOfIDAandWCSS_offline_1 = new HashMap<>(); - HashMap MapOfIDAandWCSS_offline_2 = new HashMap<>(); - HashMap MapOfIDAandWCSS_offline_3 = new HashMap<>(); +/* float WCSS_off_1 = 0; +// float WCSS_off_2 = 0; +// float WCSS_off_3 = 0; + +// HashMap MapOfIDAandWCSS_offline_1 = new HashMap<>(); +// HashMap MapOfIDAandWCSS_offline_2 = new HashMap<>(); +// HashMap MapOfIDAandWCSS_offline_3 = new HashMap<>(); // calculate the real wcss in offline fashion, so for the keys , hash the points into those buckets // and calculate the wcss as we know their centroids : - for (float[] x : so.getRawData()) - { +// for (float[] x : so.getRawData()) +// { - calcWCSSoffline(x, projector, MapOfIDAndCent1, rngvec, MapOfIDAandWCSS_offline_1); - calcWCSSoffline(x, projector, MapOfIDAndCent2, rngvec2, MapOfIDAandWCSS_offline_2); - calcWCSSoffline(x, projector, 
MapOfIDAndCent3, rngvec3, MapOfIDAandWCSS_offline_3); +// calcWCSSoffline(x, projector, MapOfIDAndCent1, rngvec, MapOfIDAandWCSS_offline_1); +// calcWCSSoffline(x, projector, MapOfIDAndCent2, rngvec2, MapOfIDAandWCSS_offline_2); +// calcWCSSoffline(x, projector, MapOfIDAndCent3, rngvec3, MapOfIDAandWCSS_offline_3); - } - +// } +*/ //* THIS IS THE RUNTIME CALCULATION OF WCSS STATISTICS WHICH IS DONE ONLINE: @@ -511,7 +557,7 @@ public Multimap findDensityModes2() { for (Long keys: sortedIDList2_3) { WCSS3 = WCSS3 + MapOfIDAndWCSS3.get(keys);} -//* THIS IS THE RUNTIME CALCULATION OF WCSS STATISTICS WHICH REQUIRES ANOTHER PASS OVER THE DATA: +/* THIS IS THE RUNTIME CALCULATION OF WCSS STATISTICS WHICH REQUIRES ANOTHER PASS OVER THE DATA: for (Long keys: sortedIDList2_1) // for (Long cur_id : (((HashMap) stream2_1).keySet())) { // System.out.println("wcss1 = " + MapOfIDAndWCSS1.get(cur_id)); @@ -523,28 +569,26 @@ public Multimap findDensityModes2() { // for (Long cur_id : (denseSetOfIDandCount2_3.keySet())) for (Long keys: sortedIDList2_3) - { WCSS_off_3 = WCSS_off_3 + MapOfIDAandWCSS_offline_3.get(keys);} - - - - System.out.print("wcss1 = " + WCSS1); - System.out.println(" wcss_ofline_1 = " + WCSS_off_1); + { WCSS_off_3 = WCSS_off_3 + MapOfIDAandWCSS_offline_3.get(keys);} +*/ - System.out.print("wcss2 = " + WCSS2); - System.out.println(" wcss_ofline_2 = " + WCSS_off_2); + System.out.println("wcss1(online calc) of candidate cents = " + WCSS1); +// System.out.println(" wcss_ofline_calc_1 = " + WCSS_off_1); - System.out.print("wcss3 = " + WCSS3); - System.out.println(" wcss_ofline_3 = " + WCSS_off_3); + System.out.println("wcss1(online calc) of candidate cents = " + WCSS2); +// System.out.println(" wcss_ofline_calc_2 = " + WCSS_off_2); + System.out.println("wcss1(online calc) of candidate cents = " + WCSS3); +// System.out.println(" wcss_ofline_calc_3 = " + WCSS_off_3); - if ((WCSS_off_1 <= WCSS_off_2) && (WCSS_off_1 <= WCSS_off_3)) + if ((WCSS1 <= WCSS2) && (WCSS1 <= 
WCSS3)) {MapOfIDAndCount = MapOfIDAndCount1; MapOfIDAndCent = MapOfIDAndCent1; MapOfIDAndWCSS = MapOfIDAndWCSS1; denseSetOfIDandCount2 = denseSetOfIDandCount2_1; System.out.println("winner = tree1"); } - else if ((WCSS_off_2<= WCSS_off_1) && (WCSS_off_2<=WCSS_off_3)) + else if ((WCSS2<= WCSS1) && (WCSS2<=WCSS3)) {MapOfIDAndCount = MapOfIDAndCount2; MapOfIDAndCent = MapOfIDAndCent2; MapOfIDAndWCSS = MapOfIDAndWCSS2; @@ -560,19 +604,22 @@ else if ((WCSS_off_2<= WCSS_off_1) && (WCSS_off_2<=WCSS_off_3)) } - System.out.println("NumberOfMicroClustersAfterPruning&beforesortingLimit = "+ denseSetOfIDandCount2.size()); + System.out.println("NumberOfMicroClusters_AfterPruning_&_beforesortingLimit = "+ denseSetOfIDandCount2.size()); //remove keys with support less than 1 - Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); + Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 2); List sortedIDList2= new ArrayList<>(); // sort and limit the list stream2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) .forEachOrdered(x -> sortedIDList2.add(x.getKey())); + System.out.println("------------------------------------------------------------------------------------------------------------------"); + //printHashmap(denseSetOfIDandCount2); + float eps= printInfo(sortedIDList2,denseSetOfIDandCount2, MapOfIDAndCent,MapOfIDAndWCSS); +// seteps(eps); Multimap multimapWeightAndCent = ArrayListMultimap.create(); - for (Long keys: sortedIDList2) @@ -581,14 +628,19 @@ else if ((WCSS_off_2<= WCSS_off_1) && (WCSS_off_2<=WCSS_off_3)) multimapWeightAndCent.put((Long)(MapOfIDAndCount.get(keys)), (float[]) (MapOfIDAndCent.get(keys))); } - - - // this is to be taken out . only done for hypothesis testing. +/* + // this is to be taken out . only done for hypothesis testing. computing wcss for all the 3 trees. 
begin: + boolean raw = Boolean.parseBoolean(("raw")); List data = null; try { - data = VectorUtil.readFile("/C:/Users/deysn/Desktop/temp/har/1D.txt", raw); + // "/C:/Users/deysn/Desktop/temp/har/1D.txt" ; /C:/Users/deysn/Documents/temp/covtype/1D.txt + // "C:/Users/deysn/Desktop/pd_backup/16gb/data_nick2/dim100/1D.txt" + // "C:/Users/deysn/Desktop/temp/dim600/1D.txt" + // "/C:/Users/deysn/Desktop/temp/run_results/3runs/1000noise10/1D.txt" + data = VectorUtil.readFile("/C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/covtype/covtype_5clus_1D.csv", raw); + } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); @@ -628,17 +680,19 @@ else if ((WCSS_off_2<= WCSS_off_1) && (WCSS_off_2<=WCSS_off_3)) centroids1.addAll(multimapWeightAndCent1.get(weight)); } -// Agglomerative3 aggloOffline = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids1, so.getk()); -// aggloOffline.setWeights(weights1); -// List finalcentroids_1 = aggloOffline.getCentroids(); + Agglomerative3 aggloOffline = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids1, so.getk()); + aggloOffline.setWeights(weights1); + List finalcentroids_1 = aggloOffline.getCentroids(); + +// KMeans2 Offline = new KMeans2(); +// Offline.setK(so.getk()); +// Offline.setRawData(centroids1); +// Offline.setWeights(weights1); +// List finalcentroids_1 = Offline.getCentroids(); - KMeans2 Offline = new KMeans2(); - Offline.setK(so.getk()); - Offline.setRawData(centroids1); - Offline.setWeights(weights1); - List finalcentroids_1 = Offline.getCentroids(); - +// MultiKMPP aggloOffline3 = new MultiKMPP(centroids1,so.getk()); +// List finalcentroids_1 = aggloOffline3.getCentroids(); Listcentroids2 = new ArrayList<>(); List weights2 =new ArrayList<>(); @@ -653,17 +707,18 @@ else if ((WCSS_off_2<= WCSS_off_1) && (WCSS_off_2<=WCSS_off_3)) centroids2.addAll(multimapWeightAndCent1.get(weight)); } -// Agglomerative3 aggloOffline2 = new 
Agglomerative3(ClusteringType.AVG_LINKAGE,centroids2, so.getk()); -// aggloOffline2.setWeights(weights2); -// List finalcentroids_2 = aggloOffline2.getCentroids(); - - - KMeans2 Offline2 = new KMeans2(); - Offline2.setK(so.getk()); - Offline2.setRawData(centroids2); - Offline2.setWeights(weights2); - List finalcentroids_2 = Offline2.getCentroids(); + Agglomerative3 aggloOffline2 = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids2, so.getk()); + aggloOffline2.setWeights(weights2); + List finalcentroids_2 = aggloOffline2.getCentroids(); +// KMeans2 Offline2 = new KMeans2(); +// Offline2.setK(so.getk()); +// Offline2.setRawData(centroids2); +// Offline2.setWeights(weights2); +// List finalcentroids_2 = Offline2.getCentroids(); + + // MultiKMPP aggloOffline2 = new MultiKMPP(centroids2,so.getk()); + // List finalcentroids_2 = aggloOffline2.getCentroids(); Listcentroids3 = new ArrayList<>(); List weights3 =new ArrayList<>(); @@ -678,37 +733,39 @@ else if ((WCSS_off_2<= WCSS_off_1) && (WCSS_off_2<=WCSS_off_3)) centroids3.addAll(multimapWeightAndCent3.get(weight)); } -// Agglomerative3 aggloOffline3 = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids3, so.getk()); -// aggloOffline3.setWeights(weights3); -// List finalcentroids_3 = aggloOffline3.getCentroids(); + Agglomerative3 aggloOffline3 = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids3, so.getk()); + aggloOffline3.setWeights(weights3); + List finalcentroids_3 = aggloOffline3.getCentroids(); - - KMeans2 Offline3 = new KMeans2(); - Offline3.setK(so.getk()); - Offline3.setRawData(centroids3); - Offline3.setWeights(weights3); - List finalcentroids_3 = Offline3.getCentroids(); - +// KMeans2 Offline3 = new KMeans2(); +// Offline3.setK(so.getk()); +// Offline3.setRawData(centroids3); +// Offline3.setWeights(weights3); +// List finalcentroids_3 = Offline3.getCentroids(); + +// MultiKMPP Offline3 = new MultiKMPP(centroids3,so.getk()); +// List finalcentroids_3 = Offline3.getCentroids(); - 
VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/Desktop/temp/har/run_results/3runs/OutputTwrpCents_tree1"),finalcentroids_1, false); - - System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_1, data)); - VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/Desktop/temp/har/run_results/3runs/OutputTwrpCents_tree2"),finalcentroids_2, false); + VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/run_results/3runs/har_k6/OutputTwrpCents_tree1"),finalcentroids_1, false); - System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_2, data)); + System.out.printf("kemans for tree1 = "+"%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_1, data)); - VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/Desktop/temp/har/run_results/3runs/OutputTwrpCents_tree3"),finalcentroids_3, false); + VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/run_results/3runs/har_k6/OutputTwrpCents_tree2"),finalcentroids_2, false); - System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_3, data)); + System.out.printf("kemans for tree2 = "+"%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_2, data)); + VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/run_results/3runs/har_k6/OutputTwrpCents_tree3"),finalcentroids_3, false); + + System.out.printf("kemans for tree3 = "+ "%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_3, data) ); + // this is to be taken out . only done for hypothesis testing. computing wcss for all the 3 trees. 
END +*/ return multimapWeightAndCent; } - - + public void run() { rngvec = new float[so.getDimparameter()]; @@ -721,38 +778,38 @@ public void run() { // Random r = new Random(so.getRandomSeed()); // Random r = new Random(3800635955020675334L) ; + Random r = new Random(); + //Random r = new Random(923063597592675214L) ; Random r2 = new Random(); + //Random r2 = new Random(923063597592675214L) ; Random r3 = new Random(); + //Random r3 = new Random(923063597592675214L) ; if (randVect==true){ - for (int i = 0; i < so.getDimparameter(); i++) + for (int i = 0; i < so.getDimparameter(); i++) { rngvec[i] = (float) r.nextGaussian(); - + //System.out.println(rngvec[i]); + } for (int i = 0; i < so.getDimparameter(); i++) rngvec2[i] = (float) r2.nextGaussian(); for (int i = 0; i < so.getDimparameter(); i++) rngvec3[i] = (float) r3.nextGaussian(); - } else { for (int i = 0; i < so.getDimparameter(); i++) rngvec[i] = (float) 0; } - Multimap WeightAndClusters = findDensityModes2(); - Listcentroids2 = new ArrayList<>(); List weights2 =new ArrayList<>(); - System.out.println("\tNumberOfMicroClusters_AfterPruning = "+ WeightAndClusters.size()); - System.out.println("getRandomVector = "+ randVect); +// System.out.println("getRandomVector = "+ randVect); - for (Long weights : WeightAndClusters.keys()) { @@ -768,86 +825,107 @@ public void run() { centroids2.addAll(WeightAndClusters.get(weight)); } - - -// Agglomerative3 aggloOffline = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids2, so.getk()); -// aggloOffline.setWeights(weights2); -// this.centroids = aggloOffline.getCentroids(); - - - -// KMeans2 aggloOffline2 = new KMeans2(); -// aggloOffline2.setK(so.getk()); -// aggloOffline2.setRawData(centroids2); + + Agglomerative3 aggloOffline = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids2, so.getk()); + aggloOffline.setWeights(weights2); + this.centroids = aggloOffline.getCentroids(); + /* + KMeans2 aggloOffline2 = new KMeans2(); + aggloOffline2.setK(so.getk()); + 
aggloOffline2.setRawData(centroids2); // aggloOffline2.setWeights(weights2); -// this.centroids = aggloOffline2.getCentroids(); - - - DBScan algo = new DBScan(centroids2); - - this.centroids = algo.getCentroids(); + this.centroids = aggloOffline2.getCentroids(); */ - System.out.println("number of centroids = "+ centroids.size()); +// MultiKMPP aggloOffline3 = new MultiKMPP(centroids2,so.getk()); +// this.centroids = aggloOffline3.getCentroids(); - +//// DBScan algo = new DBScan(centroids2, (eps/(20)), 3); +//// System.out.println("epsssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss = "+ eps/(20)); +//// this.centroids = algo.getCentroids(); +//// System.out.println("no. of final output centroids = "+ centroids.size()); + } - public static void main(String[] args) throws FileNotFoundException, - IOException { + IOException, InterruptedException { - int k = 4;//6; - int d = 16;//16; - int n = 10000; - float var = 1.5f; - int count = 1; + System.gc(); + + // int k ; //= 10; + // int d = 200;//16; + // int n = 10000; + // float var = 1.5f; + // int count = 1; // System.out.printf("ClusterVar\t"); // for (int i = 0; i < count; i++) // System.out.printf("Trial%d\t", i); // System.out.printf("RealWCSS\n"); - - String Output = "/C:/Users/deysn/Desktop/temp/har/run_results/3runs/testing" ; - float f = var; - float avgrealwcss = 0; + String Output = "/C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/run_results/3runs/rnaseq_k4/OutputTwrpCents_dbscan" ; + + // float f = var; + // float avgrealwcss = 0; float avgtime = 0; // System.out.printf("%f\t", f); // GenerateData gen = new GenerateData(k, n/k, d, f, true, .5f); - // gen.writeCSVToFile(new File("/home/lee/Desktop/reclsh/in.csv")); + // gen.writeCSVToFile(new File("/C:/Users/deysn/Desktop/temp/run_results/3runs/rough/1D.txt")); // List data = "/C:/Users/user/Desktop/temp/OutputTwrpCents1" // RPHashObject o = new SimpleArrayReader(gen.data, k); boolean raw = Boolean.parseBoolean(("raw")); 
List data = null; - data = VectorUtil.readFile("/C:/Users/deysn/Desktop/temp/gasdrift/1D.txt", raw); - k = 6; - RPHashObject o = new SimpleArrayReader(data, 6); - + // "/C:/Users/deysn/Desktop/temp/har/1D.txt" ; C:/Users/deysn/Documents/temp/covtype/1D.txt + data = VectorUtil.readFile("/C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/covtype/covtype_5clus_1D.csv", raw); + for (int k=3; k<=3;k++) + { + for (int i = 0; i < 1; i++) + { + //k = 7; + RPHashObject o = new SimpleArrayReader(data, k); o.setDimparameter(16); - o.setCutoff(60); + o.setCutoff(100); //230 o.setRandomVector(true); // System.out.println("cutoff = "+ o.getCutoff()); -// System.out.println("get_random_Vector = "+ o.getRandomVector()); - +// System.out.println("get_random_Vector = "+ o.getRandomVector()); + TWRPv6_wcss_offline2_TEST rphit = new TWRPv6_wcss_offline2_TEST(o); + + System.gc(); + + Runtime rt = Runtime.getRuntime(); + rt.gc(); + Thread.sleep(10); + rt.gc(); + long startmemory = rt.totalMemory() - rt.freeMemory(); long startTime = System.nanoTime(); + List centsr = rphit.getCentroids(); - - avgtime += (System.nanoTime() - startTime) / 100000000; + + avgtime += (System.nanoTime() - startTime) / 1000000000f ; + + float usedMB = ((rt.totalMemory() - rt.freeMemory()) - startmemory) / (1024*1024); + + System.out.println(" Time(sec) " + avgtime + ", Mem_Used(MB): " + usedMB/3 ); + + rt.gc(); + Thread.sleep(10); + rt.gc(); // avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(),gen.getData()); - VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); -// System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); - System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, data)); +// System.out.printf("WCSS for generated data = "+ "%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.out.printf("WCSS for Winning Kmeans = "+ "%.0f\t", StatTests.WCSSECentroidsFloat(centsr, data)); + System.out.println("k is: "+k); // 
System.gc(); + } + } // System.out.printf("%.0f\n", avgrealwcss / count); @@ -906,5 +984,7 @@ public void setRandomVector(boolean getRandomVector) { this.so.setRandomVector(getRandomVector); } - + public void seteps(float eps) { + this.eps=eps; + } } diff --git a/src/main/java/edu/uc/rphash/tests/clusterers/DBScan.java b/src/main/java/edu/uc/rphash/tests/clusterers/DBScan.java index 0012969..e2617da 100644 --- a/src/main/java/edu/uc/rphash/tests/clusterers/DBScan.java +++ b/src/main/java/edu/uc/rphash/tests/clusterers/DBScan.java @@ -35,16 +35,15 @@ public DBScan() { } - /* public DBScan(List , double eps , int minPoints) { + public DBScan(List data , double eps , int minPoints) { this.setRawData(data); - this.setEps(eps); - this.setminpoints(minPoints); - + this.eps = eps; + this.minPoints = minPoints; } - */ + public DBScan(List data ) { @@ -55,8 +54,8 @@ public DBScan(List data ) { public List getCentroids() { // to be completed - double eps = 2; - int minPoints = 3; + //double eps = 6; + //int minPoints = 4; DBSCANClusterer db = new DBSCANClusterer(eps , minPoints ); @@ -168,7 +167,10 @@ public static void main(String[] args) { GenerateData gen = new GenerateData(20,500,5); // the data generator of rhpash - DBScan db = new DBScan (gen.data ); + DBScan db = new DBScan (gen.data, 1 , 2 ); + + System.out.println("minpoints = "+ (db.minPoints)); + System.out.println("eps = "+ (db.eps)); System.out.println("number of centroids = "+ (db.getCentroids()).size()); diff --git a/src/main/java/edu/uc/rphash/tests/generators/GenerateData.java b/src/main/java/edu/uc/rphash/tests/generators/GenerateData.java index 5e81e50..3d643f1 100644 --- a/src/main/java/edu/uc/rphash/tests/generators/GenerateData.java +++ b/src/main/java/edu/uc/rphash/tests/generators/GenerateData.java @@ -569,11 +569,11 @@ public static void main(String[] args) throws NumberFormatException, List truncatedArgs = new ArrayList(); Map taggedArgs = argsUI(args, truncatedArgs); - int k = 10; - int d = 1000; - 
int n = 20000; - float var = 1f; - float sparseness = 1f; + int k = 20; + int d = 200; + int n = 10000; + float var = 0.8f; //1.0f; + float sparseness = 1.0f; //1f; boolean shuffle = true; boolean raw = false; @@ -585,7 +585,8 @@ public static void main(String[] args) throws NumberFormatException, if(taggedArgs.containsKey("shuffled"))shuffle = Boolean.parseBoolean(taggedArgs.get("shuffled")); if(taggedArgs.containsKey("raw"))raw = Boolean.parseBoolean(taggedArgs.get("raw")); - File outputFile = new File(args[0]+"_"+k+"x"+d+"x"+n+".mat"); + //File outputFile = new File(args[0]+"_"+k+"x"+d+"x"+n+".txt"); // ".mat" + File outputFile = new File(args[0] +"1D"+".txt"); File lblFile = new File(args[0]+"_"+k+"x"+d+"x"+n+".lbl"); System.out.printf("k=%d, n=%d, d=%d, var=%f, sparseness=%f %s > %s",k,n, From c8dc896b72956c0f93662c64fbf006c50f336ec4 Mon Sep 17 00:00:00 2001 From: deysn Date: Fri, 19 Mar 2021 05:45:46 -0400 Subject: [PATCH 12/29] updating --- src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST.java b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST.java index e591486..35a0b8c 100644 --- a/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST.java +++ b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST.java @@ -909,7 +909,7 @@ public static void main(String[] args) throws FileNotFoundException, float usedMB = ((rt.totalMemory() - rt.freeMemory()) - startmemory) / (1024*1024); - System.out.println(" Time(sec) " + avgtime + ", Mem_Used(MB): " + usedMB/3 ); + System.out.println(" Time( in sec) " + avgtime + ", Mem_Used(MB): " + usedMB/3 ); rt.gc(); Thread.sleep(10); From 310138ceb213ed2b4b7fbffd66cde67c187dabee Mon Sep 17 00:00:00 2001 From: deysn Date: Fri, 19 Mar 2021 06:26:07 -0400 Subject: [PATCH 13/29] testing merge --- src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST.java | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST.java b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST.java index 35a0b8c..48c5297 100644 --- a/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST.java +++ b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST.java @@ -909,7 +909,7 @@ public static void main(String[] args) throws FileNotFoundException, float usedMB = ((rt.totalMemory() - rt.freeMemory()) - startmemory) / (1024*1024); - System.out.println(" Time( in sec) " + avgtime + ", Mem_Used(MB): " + usedMB/3 ); + System.out.println(" Time(in sec) " + avgtime + ", Mem_Used(MB): " + usedMB/3 ); rt.gc(); Thread.sleep(10); From d45b34121591009563db28f5f607463e50ee95a2 Mon Sep 17 00:00:00 2001 From: deysn Date: Fri, 19 Mar 2021 13:51:44 -0400 Subject: [PATCH 14/29] added back the for computation the partial wcss error. --- src/main/java/edu/uc/rphash/TWRPv4.java | 42 ++++++++++++++++ src/main/java/edu/uc/rphash/TWRPv5_WCSS.java | 49 ++++++++++++++++++- .../uc/rphash/TWRPv6_wcss_offline2_TEST.java | 4 +- .../java/edu/uc/rphash/tests/StatTests.java | 1 + 4 files changed, 93 insertions(+), 3 deletions(-) diff --git a/src/main/java/edu/uc/rphash/TWRPv4.java b/src/main/java/edu/uc/rphash/TWRPv4.java index d0bad1f..df32d4d 100644 --- a/src/main/java/edu/uc/rphash/TWRPv4.java +++ b/src/main/java/edu/uc/rphash/TWRPv4.java @@ -70,6 +70,48 @@ public static float[][] UpdateHashMap(float cnt_1, float[] x_1, float cnt_2, float[] x_2) { float cnt_r = cnt_1 + cnt_2; + + float[] x_r = new float[x_1.length]; + + float[] var_r1 = new float[x_1.length]; + float[] var_r2 = new float[x_1.length]; + + double var1=0; + double var2=0; + + + for (int i = 0; i < x_1.length; i++) { + x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; + + var_r1[i] = ((-x_r[i] + x_1[i]) * (-x_r[i] + x_1[i]))/1000000000; + + var_r2[i] =(((-x_r[i] + x_2[i]) * (-x_r[i] + x_2[i])))/1000000000; + + } + + for (int i = 0; i < var_r1.length; i++) { + var1 = var1 + var_r1[i]; + 
+ var2 = var2 + var_r2[i]; + } + + + // System.out.println("wcsse = " + wcsse); + + + float[][] ret = new float[3][]; + ret[0] = new float[1]; + ret[0][0] = cnt_r; + ret[1] = x_r; + ret[2]= new float [1]; + return ret; + } + + + public static float[][] UpdateHashMap_actual(float cnt_1, float[] x_1, + float cnt_2, float[] x_2) { + + float cnt_r = cnt_1 + cnt_2; float[] x_r = new float[x_1.length]; diff --git a/src/main/java/edu/uc/rphash/TWRPv5_WCSS.java b/src/main/java/edu/uc/rphash/TWRPv5_WCSS.java index 54d3f7a..a62b740 100644 --- a/src/main/java/edu/uc/rphash/TWRPv5_WCSS.java +++ b/src/main/java/edu/uc/rphash/TWRPv5_WCSS.java @@ -98,6 +98,53 @@ public static float[][] UpdateHashMap(float cnt_1, float[] x_1, float wcss_1, float cnt_2, float[] x_2 , float wcss_2) { float cnt_r = cnt_1 + cnt_2; + + float[] x_r = new float[x_1.length]; + + float[] var_r1 = new float[x_1.length]; + float[] var_r2 = new float[x_1.length]; + + double var1=0; + double var2=0; + + + for (int i = 0; i < x_1.length; i++) { + x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; + + var_r1[i] = ((-x_r[i] + x_1[i]) * (-x_r[i] + x_1[i]))/1000000000; + + var_r2[i] =(((-x_r[i] + x_2[i]) * (-x_r[i] + x_2[i])))/1000000000; + + + } + + for (int i = 0; i < var_r1.length; i++) { + var1 = var1 + var_r1[i]; + + var2 = var2 + var_r2[i]; + } + double wcsse=0; + wcsse = ( cnt_1*(wcss_1*wcss_1 + (var1)) + var2 / (cnt_1 + cnt_2 ) ) ; + + // System.out.println("wcsse = " + wcsse); + + float wcss = (float) wcsse; + + float[][] ret = new float[3][]; + ret[0] = new float[1]; + ret[0][0] = cnt_r; + ret[1] = x_r; + ret[2]= new float [1]; + ret[2][0]= wcss; + return ret; + + } + + + public static float[][] UpdateHashMap_actual(float cnt_1, float[] x_1, float wcss_1, + float cnt_2, float[] x_2 , float wcss_2) { + + float cnt_r = cnt_1 + cnt_2; float[] x_r = new float[x_1.length]; @@ -154,7 +201,7 @@ public long hashvec2( float[] xt, float[] x, float incomingWcss= 0; - float[][] MergedValues = 
UpdateHashMap(CurrentCount , CurrentCent, currentWcss, CountForIncomingVector, IncomingVector, incomingWcss ); + float[][] MergedValues = UpdateHashMap_actual(CurrentCount , CurrentCent, currentWcss, CountForIncomingVector, IncomingVector, incomingWcss ); Long UpdatedCount = (long) MergedValues[0][0] ; diff --git a/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST.java b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST.java index 48c5297..e2da9cd 100644 --- a/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST.java +++ b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST.java @@ -861,7 +861,7 @@ public static void main(String[] args) throws FileNotFoundException, // System.out.printf("Trial%d\t", i); // System.out.printf("RealWCSS\n"); - String Output = "/C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/run_results/3runs/rnaseq_k4/OutputTwrpCents_dbscan" ; + // String Output = "/C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/run_results/3runs/rnaseq_k4/OutputTwrpCents_dbscan" ; // float f = var; // float avgrealwcss = 0; @@ -916,7 +916,7 @@ public static void main(String[] args) throws FileNotFoundException, rt.gc(); // avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(),gen.getData()); - + String Output = "/C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/run_results/3runs/rnaseq_k4/OutputTwrpCents_dbscan" ; VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); // System.out.printf("WCSS for generated data = "+ "%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); diff --git a/src/main/java/edu/uc/rphash/tests/StatTests.java b/src/main/java/edu/uc/rphash/tests/StatTests.java index 6d9aecd..e63a336 100644 --- a/src/main/java/edu/uc/rphash/tests/StatTests.java +++ b/src/main/java/edu/uc/rphash/tests/StatTests.java @@ -110,6 +110,7 @@ public static double WCSSECentroidsFloat(List estCentroids, List Date: Thu, 3 Jun 2021 15:00:16 -0400 Subject: [PATCH 15/29] added python 
scripts to measure ari and wcss for large data added classes for knee finding, tracking centroids and aging --- .classpath | 8 +- scripts/ari_test.py | 64 ++++ scripts/knee_test.py | 350 ++++++++++++++++++ scripts/measures_wcss.py | 87 +++++ .../uc/rphash/TWRPv6_wcss_offline2_TEST.java | 71 ++-- .../edu/uc/rphash/aging/ageCentriods.java | 16 + .../java/edu/uc/rphash/aging/ageVectors.java | 15 + .../centroidTracker/trackCentroids.java | 37 ++ .../edu/uc/rphash/kneefinder/findknee.java | 25 ++ 9 files changed, 648 insertions(+), 25 deletions(-) create mode 100644 scripts/ari_test.py create mode 100644 scripts/knee_test.py create mode 100644 scripts/measures_wcss.py create mode 100644 src/main/java/edu/uc/rphash/aging/ageCentriods.java create mode 100644 src/main/java/edu/uc/rphash/aging/ageVectors.java create mode 100644 src/main/java/edu/uc/rphash/centroidTracker/trackCentroids.java create mode 100644 src/main/java/edu/uc/rphash/kneefinder/findknee.java diff --git a/.classpath b/.classpath index 0e623b8..0c97049 100644 --- a/.classpath +++ b/.classpath @@ -1,9 +1,13 @@ - + + + + + - + diff --git a/scripts/ari_test.py b/scripts/ari_test.py new file mode 100644 index 0000000..4aef401 --- /dev/null +++ b/scripts/ari_test.py @@ -0,0 +1,64 @@ +import pandas as pd +import numpy as np +#from scipy.spatial import distance +from math import dist +import os +import csv +import openpyxl +from sklearn.metrics.cluster import adjusted_rand_score + +# https://github.com/cran/dendextend/blob/master/R/find_k.R +# https://cran.r-project.org/web/packages/fpc/fpc.pdf + +# scipy.spatial.distance.euclidean(A, B) +# dist([1, 0, 0], [0, 1, 0]) + +labels_true_gt=np.genfromtxt("C:/Users/dey.sn/Downloads/temp/haraal/haraal_labels_gt.csv", delimiter=',') +print(labels_true_gt.shape[0]) +print(labels_true_gt) +#column = nArr2D[:, 1] +#output_labels = np.genfromtxt('C:/Users/dey.sn/Downloads/work/output/har_k6/Labels_har_k6_kmpp,cutoff,90,k6.csv', delimiter=',') +''' 
+output_labels_col1=output_labels[:,0] +print(output_labels.shape[1]) +print(output_labels_col1) +for cols in range(output_labels.shape[1]): + print(adjusted_rand_score(labels_true_gt,output_labels[:,cols])) + +''' +# This is the path where you want to search +path = r'C:/Users/dey.sn/Downloads/work/output/haraal_k6/' +# this is the extension you want to detect +extension = '.csv' +substring="Labels" +count=0 +wb=openpyxl.Workbook() +sheet=wb.active +sheet.title= 'haraal_ari' +for root, dirs_list, files_list in os.walk(path): + for file_name in files_list: + if os.path.splitext(file_name)[-1] == extension: + file_name_path = os.path.join(root, file_name) + print(file_name) + print(file_name_path) # This is the full path of the filter file + try: + index=file_name.index(substring) + # print(index) + if(index==0): + count+=1 + output_labels = np.genfromtxt(file_name_path, delimiter=',') + b = sheet.cell(row=count, column=2) + b.value = file_name + for cols in range(output_labels.shape[1]): + ari=adjusted_rand_score(labels_true_gt,output_labels[:,cols]) + print(ari) + c = sheet.cell(row=count, column=(cols+12)) + c.value = ari + except ValueError: + print( + "Not found!") + else: + print( + "Found!") +print(count) +wb.save("C:/Users/dey.sn/Downloads/work/output/haraal_k6/results_python_ari_all_runs.xlsx") diff --git a/scripts/knee_test.py b/scripts/knee_test.py new file mode 100644 index 0000000..f5d1b68 --- /dev/null +++ b/scripts/knee_test.py @@ -0,0 +1,350 @@ +import numpy as np +from scipy import interpolate +from scipy.signal import argrelextrema +from sklearn.preprocessing import PolynomialFeatures +from sklearn.linear_model import LinearRegression +import warnings +from typing import Tuple, Optional, Iterable +import matplotlib.pyplot as plt +import pandas as pd + + + + +class KneeLocator(object): + def __init__( + self, + x: Iterable[float], + y: Iterable[float], + S: float = 1.0, + curve: str = "concave", + direction: str = "increasing", + interp_method: str 
= "interp1d", + online: bool = False, + ): + """ + Once instantiated, this class attempts to find the point of maximum + curvature on a line. The knee is accessible via the `.knee` attribute. + :param x: x values. + :param y: y values. + :param S: Sensitivity, original paper suggests default of 1.0 + :param curve: If 'concave', algorithm will detect knees. If 'convex', it + will detect elbows. + :param direction: one of {"increasing", "decreasing"} + :param interp_method: one of {"interp1d", "polynomial"} + :param online: Will correct old knee points if True, will return first knee if False + """ + # Step 0: Raw Input + self.x = np.array(x) + self.y = np.array(y) + self.curve = curve + self.direction = direction + self.N = len(self.x) + self.S = S + self.all_knees = set() + self.all_norm_knees = set() + self.all_knees_y = [] + self.all_norm_knees_y = [] + self.online = online + + # Step 1: fit a smooth line + if interp_method == "interp1d": + uspline = interpolate.interp1d(self.x, self.y) + self.Ds_y = uspline(self.x) + elif interp_method == "polynomial": + pn_model = PolynomialFeatures(7) + xpn = pn_model.fit_transform(self.x.reshape(-1, 1)) + regr_model = LinearRegression() + regr_model.fit(xpn, self.y) + self.Ds_y = regr_model.predict( + pn_model.fit_transform(self.x.reshape(-1, 1)) + ) + else: + raise ValueError( + "{} is an invalid interp_method parameter, use either 'interp1d' or 'polynomial'".format( + interp_method + ) + ) + + # Step 2: normalize values + self.x_normalized = self.__normalize(self.x) + self.y_normalized = self.__normalize(self.Ds_y) + + # Step 3: Calculate the Difference curve + self.x_normalized, self.y_normalized = self.transform_xy( + self.x_normalized, self.y_normalized, self.direction, self.curve + ) + # normalized difference curve + self.y_difference = self.y_normalized - self.x_normalized + self.x_difference = self.x_normalized.copy() + + # Step 4: Identify local maxima/minima + # local maxima + self.maxima_indices = 
argrelextrema(self.y_difference, np.greater_equal)[0] + self.x_difference_maxima = self.x_difference[self.maxima_indices] + self.y_difference_maxima = self.y_difference[self.maxima_indices] + + # local minima + self.minima_indices = argrelextrema(self.y_difference, np.less_equal)[0] + self.x_difference_minima = self.x_difference[self.minima_indices] + self.y_difference_minima = self.y_difference[self.minima_indices] + + # Step 5: Calculate thresholds + self.Tmx = self.y_difference_maxima - ( + self.S * np.abs(np.diff(self.x_normalized).mean()) + ) + + # Step 6: find knee + self.knee, self.norm_knee = self.find_knee() + + # Step 7: If we have a knee, extract data about it + self.knee_y = self.norm_knee_y = None + if self.knee: + self.knee_y = self.y[self.x == self.knee][0] + self.norm_knee_y = self.y_normalized[self.x_normalized == self.norm_knee][0] + + @staticmethod + def __normalize(a: Iterable[float]) -> Iterable[float]: + """normalize an array + :param a: The array to normalize + """ + return (a - min(a)) / (max(a) - min(a)) + + @staticmethod + def transform_xy( + x: Iterable[float], y: Iterable[float], direction: str, curve: str + ) -> Tuple[Iterable[float], Iterable[float]]: + """transform x and y to concave, increasing based on given direction and curve""" + # convert elbows to knees + if curve == "convex": + x = x.max() - x + y = y.max() - y + # flip decreasing functions to increasing + if direction == "decreasing": + y = np.flip(y, axis=0) + + if curve == "convex": + x = np.flip(x, axis=0) + y = np.flip(y, axis=0) + + return x, y + + def find_knee(self,): + """This function finds and sets the knee value and the normalized knee value. 
""" + if not self.maxima_indices.size: + warnings.warn( + "No local maxima found in the difference curve\n" + "The line is probably not polynomial, try plotting\n" + "the difference curve with plt.plot(knee.x_difference, knee.y_difference)\n" + "Also check that you aren't mistakenly setting the curve argument", + RuntimeWarning, + ) + return None, None + + # placeholder for which threshold region i is located in. + maxima_threshold_index = 0 + minima_threshold_index = 0 + # traverse the difference curve + for i, x in enumerate(self.x_difference): + # skip points on the curve before the the first local maxima + if i < self.maxima_indices[0]: + continue + + j = i + 1 + + # reached the end of the curve + if x == 1.0: + break + + # if we're at a local max, increment the maxima threshold index and continue + if (self.maxima_indices == i).any(): + threshold = self.Tmx[maxima_threshold_index] + threshold_index = i + maxima_threshold_index += 1 + # values in difference curve are at or after a local minimum + if (self.minima_indices == i).any(): + threshold = 0.0 + minima_threshold_index += 1 + + if self.y_difference[j] < threshold: + if self.curve == "convex": + if self.direction == "decreasing": + knee = self.x[threshold_index] + norm_knee = self.x_normalized[threshold_index] + else: + knee = self.x[-(threshold_index + 1)] + norm_knee = self.x_normalized[-(threshold_index + 1)] + + elif self.curve == "concave": + if self.direction == "decreasing": + knee = self.x[-(threshold_index + 1)] + norm_knee = self.x_normalized[-(threshold_index + 1)] + else: + knee = self.x[threshold_index] + norm_knee = self.x_normalized[threshold_index] + + # add the y value at the knee + y_at_knee = self.y[self.x == knee][0] + y_norm_at_knee = self.y_normalized[self.x_normalized == norm_knee][0] + if knee not in self.all_knees: + self.all_knees_y.append(y_at_knee) + self.all_norm_knees_y.append(y_norm_at_knee) + + # now add the knee + self.all_knees.add(knee) + 
self.all_norm_knees.add(norm_knee) + + # if detecting in offline mode, return the first knee found + if self.online is False: + return knee, norm_knee + + if self.all_knees == set(): + warnings.warn("No knee/elbow found") + return None, None + + return knee, norm_knee + + def plot_knee_normalized(self, figsize: Optional[Tuple[int, int]] = None): + """Plot the normalized curve, the difference curve (x_difference, y_normalized) and the knee, if it exists. + + :param figsize: Optional[Tuple[int, int] + The figure size of the plot. Example (12, 8) + :return: NoReturn + """ + import matplotlib.pyplot as plt + + if figsize is None: + figsize = (6, 6) + + plt.figure(figsize=figsize) + plt.title("Normalized Knee Point") + plt.plot(self.x_normalized, self.y_normalized, "b", label="normalized curve") + plt.plot(self.x_difference, self.y_difference, "r", label="difference curve") + plt.xticks( + np.arange(self.x_normalized.min(), self.x_normalized.max() + 0.1, 0.1) + ) + plt.yticks( + np.arange(self.y_difference.min(), self.y_normalized.max() + 0.1, 0.1) + ) + + plt.vlines( + self.norm_knee, + plt.ylim()[0], + plt.ylim()[1], + linestyles="--", + label="knee/elbow", + ) + plt.legend(loc="best") + + def plot_knee(self, figsize: Optional[Tuple[int, int]] = None): + """ + Plot the curve and the knee, if it exists + + :param figsize: Optional[Tuple[int, int] + The figure size of the plot. 
Example (12, 8) + :return: NoReturn + """ + import matplotlib.pyplot as plt + + if figsize is None: + figsize = (6, 6) + + plt.figure(figsize=figsize) + plt.title("Knee Point") + plt.plot(self.x, self.y, "b", label="data") + plt.vlines( + self.knee, plt.ylim()[0], plt.ylim()[1], linestyles="--", label="knee/elbow" + ) + plt.legend(loc="best") + + # Niceties for users working with elbows rather than knees + @property + def elbow(self): + return self.knee + + @property + def norm_elbow(self): + return self.norm_knee + + @property + def elbow_y(self): + return self.knee_y + + @property + def norm_elbow_y(self): + return self.norm_knee_y + + @property + def all_elbows(self): + return self.all_knees + + @property + def all_norm_elbows(self): + return self.all_norm_knees + + @property + def all_elbows_y(self): + return self.all_knees_y + + @property + def all_norm_elbows_y(self): + return self.all_norm_knees_y + + +## xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + + +import pandas as pd +import timeit + +#df=pd.read_excel("C:/Users/dey.sn/Downloads/work/output/elbow_graph_stage1_syn_data.xlsx") +#df=pd.read_excel("data.xlsx", sheet_name='har2', header=None, na_values=['NA'], usecols="Aq,at",skiprows=range(97),nrows=6) +df=pd.read_excel("C:/Users/dey.sn/Downloads/work/output/elbow_graph_stage1_syn_data.xlsx", sheet_name='N5%_1000', header=None, na_values=['NA'], usecols="A,y",skiprows=range(3),nrows=99) +#print(df) +conv_arr= df.values +start = timeit.default_timer() + +#split matrix into 3 columns each into 1d array +#print(conv_arr.shape) +#print(conv_arr[1,1]) +arr1 = np.delete(conv_arr,1,axis=1) +arr2 = np.delete(conv_arr,0,axis=1) + +#converting into 1D array +x = arr1.ravel() +y = arr2.ravel() + +kn = KneeLocator(list(x), y , S=0.0, curve='convex', direction='decreasing',online=False ) #,interp_method='polynomial') +stop = timeit.default_timer() +print('Time: ', stop - start) +kn2 = KneeLocator(list(x), y , S=1.0, curve='convex', 
direction='decreasing',online=False ) +print(kn.knee) +print(kn2.knee) +#print(kn.norm_knee) + +plt.style.use('ggplot') +plt.plot() +plt.xlabel('K (no. of clusters) ') +plt.ylabel('WCSSE') +#plt.title('Elbow method for optimal k.[data=HAR, k=4, Pred. k= %d]' %(kn.knee)) +plt.suptitle('Elbow Method For Optimal Cluster Determination [data=Noise_30_percent, K=10, Pred.K = %d]' %(kn.knee),x=0.5, y=0.000, ha="center" , va="bottom") +plt.plot(x, y, 'bx-') +#plt.xscale('log') +plt.grid(True) +plt.xticks() +plt.vlines(kn.knee, plt.ylim()[0], plt.ylim()[1], linestyles='dashed') +plt.savefig("C:/Users/dey.sn/Downloads/work/output/N30%_1000_graph_s0.pdf") +plt.show() + +plt.style.use('ggplot') +plt.plot() +plt.xlabel('Buckets') +plt.ylabel('Counts') +plt.title('Elbow method for optimal k. [data=Noise_30_percent, K=10, Pred.K = %d]' %(kn2.knee)) +plt.plot(x, y, 'bx-') +#plt.xscale('log') +plt.grid(True) +plt.xticks() +plt.vlines(kn2.knee, plt.ylim()[0], plt.ylim()[1], linestyles='dashed') +plt.savefig("C:/Users/dey.sn/Downloads/work/output/N30%_1000_graph_s1.pdf") +plt.show() \ No newline at end of file diff --git a/scripts/measures_wcss.py b/scripts/measures_wcss.py new file mode 100644 index 0000000..17d4bdb --- /dev/null +++ b/scripts/measures_wcss.py @@ -0,0 +1,87 @@ +import pandas as pd +import numpy as np +#from scipy.spatial import distance +from math import dist +import os +import csv +import openpyxl +from sklearn.metrics.cluster import adjusted_rand_score + +# https://github.com/cran/dendextend/blob/master/R/find_k.R +# https://cran.r-project.org/web/packages/fpc/fpc.pdf + +# scipy.spatial.distance.euclidean(A, B) +# dist([1, 0, 0], [0, 1, 0]) + +data=np.genfromtxt("C:/Users/dey.sn/Downloads/temp/haraal/2d.csv", delimiter=',') +print(data.shape[0]) +#print(data[10298]) +vectors=data.shape[0] + +# This is the path where you want to search +path = r'C:/Users/dey.sn/Downloads/work/output/haraal_k6/' +# this is the extension you want to detect +extension = '.csv' 
+substring="haraal_k6" +count=0 +wb=openpyxl.Workbook() +sheet=wb.active +sheet.title= 'haraal' +for root, dirs_list, files_list in os.walk(path): + for file_name in files_list: + if os.path.splitext(file_name)[-1] == extension: + file_name_path = os.path.join(root, file_name) + print(file_name) + print(file_name_path) # This is the full path of the filter file + try: + index=file_name.index(substring) + # print(index) + if(index==0): + count+=1 + centarr = np.genfromtxt(file_name_path, delimiter=',') + b = sheet.cell(row=count, column=2) + b.value = file_name +# centarr = np.genfromtxt('C:/Users/dey.sn/Downloads/work/output/har_k6/har_k6_kmeans_120cutoff _4_2.csv', delimiter=',') +# print(np.shape(centarr)) +# print(centarr[0],centarr[1]) + index = 2 + row=int(centarr[0]) # number of centroids + col=int(centarr[1]) + cents=[] + for i in range(row): + c1=[] + for j in range(col): + c1.append(centarr[index]) + index += 1 + cents.append(c1) + +# print(cents[2]) +# print(np.shape(cents)) + + wcss1=0 + for i in range (vectors): + distance1 = [] + for j in range(row): +# print(j) + d1=(dist(data[i], cents[j])) + #print(d1) + distance1.append(d1) + + print(distance1) + mindist=min(distance1) + print(mindist) + + wcss1= int(wcss1 + (mindist*mindist)) + + print("wcss1 is : " , (wcss1)) + + c = sheet.cell(row=count, column=12) + c.value = wcss1 + except ValueError: + print + "Not found!" + else: + print + "Found!" 
+print(count) +wb.save("C:/Users/dey.sn/Downloads/work/output/haraal_k6/results_python_wcss_all_runs.xlsx") diff --git a/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST.java b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST.java index e2da9cd..9138c3f 100644 --- a/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST.java +++ b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST.java @@ -36,13 +36,30 @@ import com.google.common.collect.Multimap; +// https://www.javatips.net/api/webofneeds-master/webofneeds/won-matcher-solr/src/main/java/won/matcher/solr/utils/Kneedle.java +// https://github.com/lukehb/137-stopmove/blob/master/src/main/java/onethreeseven/stopmove/algorithm/Kneedle.java // this algorithm runs twrp 3 times : (only the random bisection vector varies, the Projection matrix remains same) // and selects the one which has the best wcss offline for the 10X candidate centroids. public class TWRPv6_wcss_offline2_TEST implements Clusterer, Runnable { - boolean znorm = false; + List labels; // to directly output labels + HashMap labelmap; // to directly output labels + public List getLabels() { + for (int i = 0; i < labels.size(); i++) { + if (labelmap.containsKey(labels.get(i))) { + labels.set(i, labelmap.get(labels.get(i))); + } else { + labels.set(i, -1l); + } + } + return this.labels; + } + + + + boolean znorm = false; private int counter; private float[] rngvec; private float[] rngvec2; @@ -363,7 +380,7 @@ public Multimap findDensityModes2() { } } - System.out.println("\nNumberOfMicroClustersBeforePruning = "+ MapOfIDAndCent1.size()); + System.out.println("\nNumberOfMicroClustersBeforePruning = , "+ MapOfIDAndCent1.size()); //printHashmap(MapOfIDAndCount1); // next we want to prune the tree by parent count comparison @@ -572,13 +589,13 @@ public Multimap findDensityModes2() { { WCSS_off_3 = WCSS_off_3 + MapOfIDAandWCSS_offline_3.get(keys);} */ - System.out.println("wcss1(online calc) of candidate cents = " + WCSS1); + 
System.out.println("wcss1(online calc) of candidate cents = , " + WCSS1); // System.out.println(" wcss_ofline_calc_1 = " + WCSS_off_1); - System.out.println("wcss1(online calc) of candidate cents = " + WCSS2); + System.out.println("wcss1(online calc) of candidate cents = , " + WCSS2); // System.out.println(" wcss_ofline_calc_2 = " + WCSS_off_2); - System.out.println("wcss1(online calc) of candidate cents = " + WCSS3); + System.out.println("wcss1(online calc) of candidate cents = , " + WCSS3); // System.out.println(" wcss_ofline_calc_3 = " + WCSS_off_3); if ((WCSS1 <= WCSS2) && (WCSS1 <= WCSS3)) @@ -604,7 +621,7 @@ else if ((WCSS2<= WCSS1) && (WCSS2<=WCSS3)) } - System.out.println("NumberOfMicroClusters_AfterPruning_&_beforesortingLimit = "+ denseSetOfIDandCount2.size()); + System.out.println("NumberOfMicroClusters_AfterPruning_&_beforesortingLimit = , "+ denseSetOfIDandCount2.size()); //remove keys with support less than 1 Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 2); @@ -807,7 +824,7 @@ public void run() { Listcentroids2 = new ArrayList<>(); List weights2 =new ArrayList<>(); - System.out.println("\tNumberOfMicroClusters_AfterPruning = "+ WeightAndClusters.size()); + System.out.println("\tNumberOfMicroClusters_AfterPruning = , "+ WeightAndClusters.size()); // System.out.println("getRandomVector = "+ randVect); for (Long weights : WeightAndClusters.keys()) @@ -826,15 +843,15 @@ public void run() { } - Agglomerative3 aggloOffline = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids2, so.getk()); - aggloOffline.setWeights(weights2); - this.centroids = aggloOffline.getCentroids(); - /* +// Agglomerative3 aggloOffline = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids2, so.getk()); +// aggloOffline.setWeights(weights2); +// this.centroids = aggloOffline.getCentroids(); + KMeans2 aggloOffline2 = new KMeans2(); aggloOffline2.setK(so.getk()); aggloOffline2.setRawData(centroids2); // 
aggloOffline2.setWeights(weights2); - this.centroids = aggloOffline2.getCentroids(); */ + this.centroids = aggloOffline2.getCentroids(); // MultiKMPP aggloOffline3 = new MultiKMPP(centroids2,so.getk()); // this.centroids = aggloOffline3.getCentroids(); @@ -877,16 +894,20 @@ public static void main(String[] args) throws FileNotFoundException, boolean raw = Boolean.parseBoolean(("raw")); List data = null; // "/C:/Users/deysn/Desktop/temp/har/1D.txt" ; C:/Users/deysn/Documents/temp/covtype/1D.txt - data = VectorUtil.readFile("/C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/covtype/covtype_5clus_1D.csv", raw); - for (int k=3; k<=3;k++) + // C:/Users/dey.sn/Downloads/temp/covtype/1D.csv ; "C:/Users/dey.sn/Downloads/temp/run_results/3runs/har_k6/1D.txt" + String inputfile = "C:/Users/dey.sn/Downloads/temp/crop_mapping/1D.csv" ; + System.out.println(inputfile); + data = VectorUtil.readFile( inputfile , raw); + for (int k=4; k<=11;k++) { - for (int i = 0; i < 1; i++) + for (int i = 1; i <= 3; i++) { //k = 7; + RPHashObject o = new SimpleArrayReader(data, k); - + o.setDimparameter(16); - o.setCutoff(100); //230 + o.setCutoff(130); //230 o.setRandomVector(true); // System.out.println("cutoff = "+ o.getCutoff()); @@ -906,22 +927,26 @@ public static void main(String[] args) throws FileNotFoundException, List centsr = rphit.getCentroids(); avgtime += (System.nanoTime() - startTime) / 1000000000f ; - + float usedMB = ((rt.totalMemory() - rt.freeMemory()) - startmemory) / (1024*1024); - System.out.println(" Time(in sec) " + avgtime + ", Mem_Used(MB): " + usedMB/3 ); + System.out.println(" Time(in sec), " + avgtime + ", Mem_Used(MB):, " + (usedMB/3) ); rt.gc(); Thread.sleep(10); rt.gc(); // avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(),gen.getData()); - String Output = "/C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/run_results/3runs/rnaseq_k4/OutputTwrpCents_dbscan" ; - VectorUtil.writeCentroidsToFile(new File(Output),centsr, 
false); +// String Output = "/C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/run_results/3runs/rnaseq_k4/OutputTwrpCents_dbscan" ; + String Output = "C:/Users/dey.sn/Downloads/work/output/cropmap_k7/cropmap_k7_kmeans_130_cutoff"+"_" +k+"_"+i+".csv" ; + VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); + +// VectorUtil.writeVectorFile(new File(Output+"_"+"labels"+".txt"), centsr.getLabels()); + // System.out.printf("WCSS for generated data = "+ "%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); - System.out.printf("WCSS for Winning Kmeans = "+ "%.0f\t", StatTests.WCSSECentroidsFloat(centsr, data)); - System.out.println("k is: "+k); + System.out.printf(",WCSS for Winning Kmeans, = , "+ "%.0f ", StatTests.WCSSECentroidsFloat(centsr, data)); + System.out.println(",k, is: , "+k); // System.gc(); } diff --git a/src/main/java/edu/uc/rphash/aging/ageCentriods.java b/src/main/java/edu/uc/rphash/aging/ageCentriods.java new file mode 100644 index 0000000..775189e --- /dev/null +++ b/src/main/java/edu/uc/rphash/aging/ageCentriods.java @@ -0,0 +1,16 @@ +package edu.uc.rphash.aging; + +import edu.uc.rphash.Centroid; +import edu.uc.rphash.Readers.RPHashObject; + + +public class ageCentriods implements Runnable { + + @Override + public void run() { + // TODO Auto-generated method stub + + } + + +} diff --git a/src/main/java/edu/uc/rphash/aging/ageVectors.java b/src/main/java/edu/uc/rphash/aging/ageVectors.java new file mode 100644 index 0000000..485859e --- /dev/null +++ b/src/main/java/edu/uc/rphash/aging/ageVectors.java @@ -0,0 +1,15 @@ +package edu.uc.rphash.aging; + +import edu.uc.rphash.Centroid; +import edu.uc.rphash.Readers.RPHashObject; + + +public class ageVectors implements Runnable { + + + @Override + public void run() { + + } + +} diff --git a/src/main/java/edu/uc/rphash/centroidTracker/trackCentroids.java b/src/main/java/edu/uc/rphash/centroidTracker/trackCentroids.java new file mode 100644 index 0000000..70a31bd --- 
/dev/null +++ b/src/main/java/edu/uc/rphash/centroidTracker/trackCentroids.java @@ -0,0 +1,37 @@ +package edu.uc.rphash.centroidTracker; + +import edu.uc.rphash.Centroid; +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.frequentItemSet.KHHCentroidCounter; +import edu.uc.rphash.lsh.LSH; + +public class trackCentroids implements Runnable { + + private float[] vec; + + + public trackCentroids(float[] vec, LSH[] lshfuncs) { + + } + + static float[] scale(float[] t, float s) { + float[] ret = new float[t.length]; + for (int i = 0; i < t.length; i++) { + ret[i] = s*t[i]; + } + + return ret; + } + + @Override + public void run() { + // TODO Auto-generated method stub + + } + + +} + + + + diff --git a/src/main/java/edu/uc/rphash/kneefinder/findknee.java b/src/main/java/edu/uc/rphash/kneefinder/findknee.java new file mode 100644 index 0000000..5253cb4 --- /dev/null +++ b/src/main/java/edu/uc/rphash/kneefinder/findknee.java @@ -0,0 +1,25 @@ +package edu.uc.rphash.kneefinder; + +import edu.uc.rphash.Centroid; +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.frequentItemSet.KHHCentroidCounter; +import edu.uc.rphash.lsh.LSH; + +public class findknee implements Runnable { + + private float[] vec; + + + + @Override + public void run() { + // TODO Auto-generated method stub + + } + + +} + + + + From ec21452ae70c3ba1cd5ab96111b8b8c28d41f547 Mon Sep 17 00:00:00 2001 From: deysn Date: Wed, 21 Jul 2021 19:49:37 -0400 Subject: [PATCH 16/29] Adding the knee finding algorithm in java --- .../edu/uc/rphash/kneefinder/Kneedle.java | 170 +++++ .../edu/uc/rphash/kneefinder/findknee.java | 2 + src/main/java/edu/uc/rphash/util/Maths.java | 628 ++++++++++++++++++ 3 files changed, 800 insertions(+) create mode 100644 src/main/java/edu/uc/rphash/kneefinder/Kneedle.java create mode 100644 src/main/java/edu/uc/rphash/util/Maths.java diff --git a/src/main/java/edu/uc/rphash/kneefinder/Kneedle.java b/src/main/java/edu/uc/rphash/kneefinder/Kneedle.java new file mode 
100644 index 0000000..d0a5224 --- /dev/null +++ b/src/main/java/edu/uc/rphash/kneefinder/Kneedle.java @@ -0,0 +1,170 @@ +package edu.uc.rphash.kneefinder; + + +import edu.uc.rphash.util.Maths; + +import java.util.ArrayList; + + +// to find the knee, taken from " https://github.com/lukehb/137-stopmove/blob/master/src/main/java/onethreeseven/stopmove/algorithm/Kneedle.java by Luke Bermingham" + +/** + * Given set of values look for the elbow/knee points. + * See paper: "Finding a Kneedle in a Haystack: Detecting Knee Points in System Behavior" + */ + + +public class Kneedle { + + /** + * Finds the indices of all local minimum or local maximum values. + * @param data The data to process + * @param findMinima If true find local minimums, else find local maximums. + * @return A list of the indices that have local minimum or maximum values. + */ + private ArrayList findCandidateIndices(double[][] data, boolean findMinima){ + ArrayList candidates = new ArrayList<>(); + //a coordinate is considered a candidate if both of its adjacent points have y-values + //that are greater or less (depending on whether we want local minima or local maxima) + for (int i = 1; i < data.length - 1; i++) { + double prev = data[i-1][1]; + double cur = data[i][1]; + double next = data[i+1][1]; + boolean isCandidate = (findMinima) ? (prev > cur && next > cur) : (prev < cur && next < cur); + if(isCandidate){ + candidates.add(i); + } + } + return candidates; + } + + + /** + * Find the index in the data the represents a most exaggerated elbow point. + * @param data the data to find an elbow in + * @return The index of the elbow point. 
+ */ + private int findElbowIndex(double[] data){ + + int bestIdx = 0; + double bestScore = 0; + for (int i = 0; i < data.length; i++) { + double score = Math.abs(data[i]); + if(score > bestScore){ + bestScore = score; + bestIdx = i; + } + } + return bestIdx; + } + + /** + * Prepares the data by smoothing, then normalising into unit range 0-1, + * and finally, subtracting the y-value from the x-value. + * @param data The data to prepare. + * @param smoothingWindow Size of the smoothing window. + * @return The normalised data. + */ + private double[][] prepare(double[][] data, int smoothingWindow){ + + //smooth the data to make local minimum/maximum easier to find (this is Step 1 in the paper) + double[][] smoothedData = Maths.gaussianSmooth2d(data, smoothingWindow); + + //prepare the data into the unit range (step 2 of paper) + double[][] normalisedData = Maths.minmaxNormalise(smoothedData); + + //subtract normalised x from normalised y (this is step 3 in the paper) + for (int i = 0; i < normalisedData.length; i++) { + normalisedData[i][1] = normalisedData[i][1] - normalisedData[i][0]; + } + + return normalisedData; + } + + private double computeAverageVarianceX(double[][] data){ + double sumVariance = 0; + for (int i = 0; i < data.length - 1; i++) { + sumVariance += data[i + 1][0] - data[i][0]; + } + return sumVariance / (data.length - 1); + } + + /** + * Uses a heuristic to find what may be an elbow in the 1d data. + * This method is a heuristic so it may return in invalid elbow. + * If you need guarantees use the other method {@link Kneedle#run(double[][], double, int, boolean)} + * @param data The + * @return A possible elbow for this 1d data. 
+ */ + public double findElbowQuick(double[] data){ + if(data.length <= 1){ + return 0; + } + + double[] normalisedData = Maths.minmaxNormalise1d(Maths.gaussianSmooth(data, 3)); + + //do kneedle y'-x' (in this case x' is normalised index value) + for (int i = 0; i < normalisedData.length; i++) { + double normalisedIndex = (double)i / data.length; + normalisedData[i] = normalisedData[i] - normalisedIndex; + } + + int elbowIdx = findElbowIndex(normalisedData); + return data[elbowIdx]; + } + + /** + * This algorithm finds the so-called elbow/knee in the data. + * See paper: "Finding a Kneedle in a Haystack: Detecting Knee Points in System Behavior" + * for more details. + * @param data The 2d data to find an elbow in. + * @param s How many "flat" points to require before we consider it a knee/elbow. + * @param smoothingWindow The data is smoothed using Gaussian kernel average smoother, this parameter is the window used for averaging + * (higher values mean more smoothing, try 3 to begin with). + * @param findElbows Whether to find elbows or knees. + * @return The elbow or knee values. + */ + public ArrayList run(double[][] data, double s, int smoothingWindow, boolean findElbows){ + + if(data.length == 0){ + throw new IllegalArgumentException("Cannot find elbow or knee points in empty data."); + } + if(data[0].length != 2){ + throw new IllegalArgumentException("Cannot run Kneedle, this method expects all data to be 2d."); + } + + ArrayList localMinMaxPts = new ArrayList<>(); + //do steps 1,2,3 of the paper in the prepare method + double[][] normalisedData = prepare(data, smoothingWindow); + //find candidate indices (this is step 4 in the paper) + { + ArrayList candidateIndices = findCandidateIndices(normalisedData, findElbows); + //go through each candidate index, i, and see if the indices after i are satisfy the threshold requirement + //(this is step 5 in the paper) + double step = computeAverageVarianceX(normalisedData); + step = findElbows ? 
step * s : step * -s; + + //check each candidate to see if it is a real elbow/knee + //(this is step 6 in the paper) + for (int i = 0; i < candidateIndices.size(); i++) { + Integer candidateIdx = candidateIndices.get(i); + Integer endIdx = (i + 1 < candidateIndices.size()) ? candidateIndices.get(i+1) : data.length; + + double threshold = normalisedData[candidateIdx][1] + step; + + for (int j = candidateIdx + 1; j < endIdx; j++) { + boolean isRealElbowOrKnee = (findElbows) ? + normalisedData[j][1] > threshold : normalisedData[j][1] < threshold; + if(isRealElbowOrKnee) { + localMinMaxPts.add(data[candidateIdx]); + break; + } + } + } + } + return localMinMaxPts; + } + + + +} \ No newline at end of file diff --git a/src/main/java/edu/uc/rphash/kneefinder/findknee.java b/src/main/java/edu/uc/rphash/kneefinder/findknee.java index 5253cb4..fafa53e 100644 --- a/src/main/java/edu/uc/rphash/kneefinder/findknee.java +++ b/src/main/java/edu/uc/rphash/kneefinder/findknee.java @@ -4,6 +4,8 @@ import edu.uc.rphash.Readers.RPHashObject; import edu.uc.rphash.frequentItemSet.KHHCentroidCounter; import edu.uc.rphash.lsh.LSH; +import java.util.ArrayList; + public class findknee implements Runnable { diff --git a/src/main/java/edu/uc/rphash/util/Maths.java b/src/main/java/edu/uc/rphash/util/Maths.java new file mode 100644 index 0000000..4f6364b --- /dev/null +++ b/src/main/java/edu/uc/rphash/util/Maths.java @@ -0,0 +1,628 @@ +package edu.uc.rphash.util; + +import java.util.*; + +// taken from " https://github.com/lukehb/137-common/blob/master/src/main/java/onethreeseven/common/util/Maths.java by Luke Bermingham " +/** + * A utility of mathematical methods. + */ + + + +public final class Maths { + + /** + * The height of normal distribution gaussian with std 1 and mean = 0. + */ + public static final double GaussHeight = 1.0/Math.sqrt(2 * Math.PI); + + private Maths() { + } + + /** + * Find the angle (in degrees) between two points, p1 and p2. 
+ * @param x1 P1.x + * @param y1 P1.y + * @param x2 P2.x + * @param y2 P2.y + * @return The angle between p1 and p2 (in degrees). + */ + public static double angleBetween(double x1, double y1, double x2, double y2){ + double angle = Math.atan2(y2, x2) - Math.atan2(y1, x1); + if (angle < 0){ + angle += 2 * Math.PI; + } + return Math.toDegrees(angle); + } + + /** + * Find the point with the minimal pairwise distance between all points. + * @param pts some points + * @return The medoid point from the points data-set (i.e. the most central point). + */ + public static double[] medoid(double[][] pts){ + double bestDist = Double.MAX_VALUE; + double[] medoid = null; + + for (int i = 0; i < pts.length; i++) { + double[] pt = pts[i]; + double totalDisp = 0; + for (int j = 0; j < pts.length; j++) { + if(j == i){continue;} + totalDisp += dist(pts[i], pts[j]); + } + if(totalDisp < bestDist){ + bestDist = totalDisp; + medoid = pt; + } + } + return medoid; + } + + /** + * Find the maximum gap in a series of numerical data and find the middle of that largest gap. + * @param data The 1d numerical data. + * @return The largest gap. + */ + public static double[] maxGap(double[] data){ + double[] sorted = new double[data.length]; + System.arraycopy(data, 0, sorted, 0, data.length); + Arrays.sort(sorted); + + double maxGap = 0; + double[] minMax = new double[]{0,0}; + + for (int i = 1; i < data.length; i++) { + double gap = sorted[i] - sorted[i-1]; + if(gap > maxGap){ + maxGap = gap; + minMax[0] = sorted[i-1]; + minMax[1] = sorted[i]; + } + } + return minMax; + } + + /** + * The euclidean distance between two n-d points (order doesn't matter). + * @param a Point a + * @param b Point b + * @return The euclidean distance between two points. + */ + public static double dist(double[] a, double[] b){ + return Math.sqrt(Maths.distSq(a,b)); + } + + /** + * Returns the euclidean distance squared between two n-d points. + * @param a Point a. + * @param b Point b. 
+ * @return The euclidean distance squared between two points. + */ + public static double distSq(double[] a, double[] b){ + double distSq = 0; + for (int i = 0; i < a.length; i++) { + distSq += Math.pow(a[i] - b[i], 2); + } + return distSq; + } + + /** + * @param x The variable input into the function. + * @param height The height of the center of the curve (sometimes called 'a'). + * @param center The center of the curve (sometimes called 'b'). + * @param width The standard deviation, i.e ~68% of the data will be contained in center ± the width. + * @return A gaussian function. + */ + public static double gaussian(double x, double height, double center, double width){ + return height * Math.exp(-(x-center)*(x-center)/(2.0*width*width) ); + } + + public static long mean(long[] d){ + long total = 0; + for (long v : d) { + total += v; + } + return total/d.length; + } + + public static void shuffle(int[] array){ + Random rand = new Random(); + for (int i = array.length - 1; i > 0; i--) + { + int index = rand.nextInt(i + 1); + // Simple swap + int a = array[index]; + array[index] = array[i]; + array[i] = a; + } + } + + public static double mean(double[] d){ + double total = 0; + for (double v : d) { + total += v; + } + return total/d.length; + } + + public static double std(double[] data){ + double mean = mean(data); + double std = 0; + for (double d : data) { + double deviation = d - mean; + std += deviation * deviation; + } + std /= data.length; + return Math.sqrt(std); + } + + public static long[] absDeviationsFromMedian(long[] data){ + long median = median(data); + long[] deviations = new long[data.length]; + for (int i = 0; i < data.length; i++) { + deviations[i] = Math.abs(data[i] - median); + } + return deviations; + } + + /** + * Calculate the absolute deviations a sample has away from its median. + * @param data The data to determine median and deviations for. + * @return An array of absolute deviations away from the median. 
+ */ + public static double[] absDeviationsFromMedian(double[] data){ + double median = median(data); + double[] deviations = new double[data.length]; + for (int i = 0; i < data.length; i++) { + deviations[i] = Math.abs(data[i] - median); + } + return deviations; + } + + /** + * Linearly interpolate resolve a starting point towards some ending point. + * @param startPt The point to start at. + * @param endPt The point to head towards. + * @param alpha A value of 0 ends at the start pt, a value of 1 ends at the end point, a value + * greater than 1 over shoots the end point but continues following that same + * direction, likewise, a negative value heads backwards resolve the starting point + * with the end point reachable in a straight line. + * @return The newly interpolated position. + */ + public static double[] lerp(double[] startPt, double[] endPt, double alpha){ + if(startPt.length != endPt.length){ + throw new IllegalArgumentException("Start and end must have equal lengths."); + } + //we use c as the direction, and then as the final output + double[] c = new double[startPt.length]; + for (int i = 0; i < startPt.length; i++) { + c[i] = startPt[i] + ( (endPt[i] - startPt[i]) * alpha ); + } + return c; + } + + /** + * Do an element-wise subtraction such that, result[i] = a[i] - b[i]. + * @param a array a + * @param b array b + * @return the resulting "subtracted" result[] array. + */ + public static double[] sub(double[] a, double[] b){ + if(a.length != b.length){ + throw new IllegalArgumentException("Array A and B must be the same length."); + } + double[] result = new double[a.length]; + for (int i = 0; i < a.length; i++) { + result[i] = a[i] - b[i]; + } + return result; + } + + /** + * Do an element-wise subtraction such that, result[i] = a[i] - b[i]. + * @param a array a + * @param b array b + * @return the resulting "subtracted" result[] array. 
+ */ + public static int[] sub(int[] a, int[] b){ + if(a.length != b.length){ + throw new IllegalArgumentException("Array A and B must be the same length."); + } + int[] result = new int[a.length]; + for (int i = 0; i < a.length; i++) { + result[i] = a[i] - b[i]; + } + return result; + } + + /** + * @param a the array + * @return The maximum absolute element in the array 'a'. + */ + public static int maxAbsElement(double[] a){ + int max = (int) Math.abs(a[0]); + for (int i = 1; i < a.length; i++) { + int element = (int) Math.abs(a[i]); + if(element > max){ + max = element; + } + } + return max; + } + + /** + * Divide every element in array 'a' by a given scalar. + * @param a the array + * @param scalar the divisor + * @return The array such that, result[i] = a[i] / scalar + */ + public static double[] div(double[] a, double scalar){ + double[] result = new double[a.length]; + for (int i = 0; i < a.length; i++) { + result[i] = a[i] / scalar; + } + return result; + } + + /** + * Find the area of a triangle defined by three points: a,b,c. + * @param ax X coordinate of point a. + * @param ay Y coordinate of point a. + * @param bx X coordinate of point b. + * @param by Y coordinate of point b. + * @param cx X coordinate of point c. + * @param cy Y coordinate of point c. + * @return The area of the triangle. + */ + public static double triArea(double ax, double ay, + double bx, double by, + double cx, double cy){ + return Math.abs((ax - cx) * (by - ay) - (ax - bx) * (cy - ay)) * 0.5; + } + + public static double triArea3D(double ax, double ay, double az, + double bx, double by, double bz, + double cx, double cy, double cz) { + return 0.5 * Math.sqrt(dotSq(ax, ay, bx, by, cx, cy) + + dotSq(ax, az, bx, bz, cx, cz) + dotSq(ay, az, by, bz, cy, cz)); + } + + /** + * Returns the cross product of two 3d vectors. + * @param a 3d vector "a". + * @param b 3d vector "b". + * @return The cross product of a and b. In other words, the vector that is orthogonal to a and b. 
+ */ + public static double[] cross3d(double[] a, double[] b){ + if(a.length != 3){ + throw new IllegalArgumentException("Vector a length must equal 3."); + } + if(b.length != 3){ + throw new IllegalArgumentException("Vector b length must equal 3."); + } + return new double[]{ + a[1]*b[2] - a[2]*b[1], + a[2]*b[0] - a[0]*b[2], + a[0]*b[1] - a[1]*b[0] + }; + } + + /** + * Dot vector "a" against vector "b". + * That is, if a = [a1,a2,...,an] and b = [b1,b2,...,bn] + * then a dot b = a1*b1 + a2*b2 + ... + an*bn. + * @param a Vector a. + * @param b Vector b. + * @return The result of a dot b. + */ + public static double dot(double[] a, double[] b){ + if(a.length != b.length){ + throw new IllegalArgumentException("Vector 'a' must have the same length as vector 'b'."); + } + double dotProduct = 0; + for (int i = 0; i < a.length; i++) { + dotProduct += a[i]*b[i]; + } + return dotProduct; + } + + /** + * Multiply every component in "a" by a scalar. + * @param a The vector "a". + * @param scalar The scalar. + * @return A new vector. Original "a" is not modified. + */ + public static double[] scale(double[] a, double scalar){ + double[] aPrime = new double[a.length]; + for (int i = 0; i < a.length; i++) { + aPrime[i] = a[i] * scalar; + } + return aPrime; + } + + /** + * Add vectors a and b together in a component-wise fashion. + * @param a Vector a. + * @param b Vector b. + * @return Return vector a plus vector b in a new vector. + */ + public static double[] add(double[] a, double[] b){ + if(a.length != b.length){ + throw new IllegalArgumentException("Vector a and b must have the same lengths."); + } + double[] c = new double[a.length]; + for (int i = 0; i < a.length; i++) { + c[i] = a[i] + b[i]; + } + return c; + } + + /** + * Get the perpendicular distance between a point and a line formed by two other points. + * @param start The start point of the line. + * @param end The end point of the line. + * @param otherPt The other point to get the perpendicular distance from. 
+ * @return The perpendicular distance between the point and the line. + */ + public static double perpendicularDistance(double[] start, double[] end, double[] otherPt){ + if(start.length != end.length || start.length != otherPt.length){ + throw new IllegalArgumentException("Vectors must have equal lengths."); + } + double[] projectedPt = projectAlong(start, end, otherPt); + return Maths.dist(otherPt, projectedPt); + } + + /** + * Given a line formed by start and end points and some other point, + * project that other point onto the line. + * @param start The start point. + * @param end The end point. + * @param otherPt The other point. + * @return The point projected onto the line. + */ + public static double[] projectAlong(double[] start, double[] end, double[] otherPt){ + if(start.length != end.length || start.length != otherPt.length){ + throw new IllegalArgumentException("Vectors must have equal lengths."); + } + double[] ab = Maths.sub(otherPt,start); + double[] ac = Maths.sub(end,start); + double percentageAlong = Maths.dot(ab, ac) / Maths.dot(ac, ac); + double[] amountMovedAC = Maths.scale(ac, percentageAlong); + return Maths.add(start, amountMovedAC); + } + + private static double dotSq(double ax, double ay, double bx, double by, double cx, double cy) { + double dot = ax * by - ax * cy + bx * cy - bx * ay + cx * ay - cx * by; + return dot * dot; + } + + public static double median(double[] data){ + if(data.length == 0){ + return Double.NaN; + } + double[] d = new double[data.length]; + System.arraycopy(data, 0, d, 0, data.length); + Arrays.sort(d); + int len = d.length; + if(len == 1){ + return d[0]; + } + //even case + else if(len % 2 == 0){ + int midRightIdx = (d.length) / 2; + int midLeftIdx = midRightIdx - 1; + return (d[midRightIdx] + d[midLeftIdx]) / 2.0; + } + //odd case + else{ + int midIdx = (d.length - 1) / 2; + return d[midIdx]; + } + } + + public static long median(long[] data){ + if(data.length == 0){ + throw new IllegalArgumentException("Data 
must have at least one element to find median."); + } + long[] d = new long[data.length]; + System.arraycopy(data, 0, d, 0, data.length); + Arrays.sort(d); + int len = d.length; + if(len == 1){ + return d[0]; + } + //even case + else if(len % 2 == 0){ + int midRightIdx = (d.length) / 2; + int midLeftIdx = midRightIdx - 1; + return (long) ((d[midRightIdx] + d[midLeftIdx]) / 2.0); + } + //odd case + else{ + int midIdx = (d.length - 1) / 2; + return d[midIdx]; + } + + } + + public static double mode(double[] data){ + HashMap tally = new HashMap<>(); + + for (double v : data) { + int nOccurrences = tally.getOrDefault(v, 0) + 1; + tally.put(v, nOccurrences); + } + + Optional> modalOpt = + tally.entrySet().stream().max((o1, o2) -> Integer.compare(o1.getValue(), o2.getValue())); + + if(modalOpt.isPresent()){ + return modalOpt.get().getKey(); + } + + return Double.NaN; + } + + /** + * Smooth the data using a gaussian kernel. + * @param data The data to smooth. + * @param n The size of sliding window (i.e number of indices either side to sample). + * @return The smoothed version of the data. + */ + public static double[] gaussianSmooth(double[] data, int n){ + double[] smoothed = new double[data.length]; + + for (int i = 0; i < data.length; i++) { + int startIdx = Math.max(0, i - n); + int endIdx = Math.min(data.length - 1, i + n); + + double sumWeights = 0; + double sumIndexWeight = 0; + + for (int j = startIdx; j < endIdx + 1; j++) { + double indexScore = Math.abs(j - i)/(double)n; + double indexWeight = Maths.gaussian(indexScore, 1, 0, 1); + sumWeights += (indexWeight * data[j]); + sumIndexWeight += indexWeight; + } + smoothed[i] = sumWeights/sumIndexWeight; + } + return smoothed; + } + + /** + * Smooth the data using a gaussian kernel. + * @param data The data to smooth. + * @param w The size of sliding window (i.e number of indices either side to sample). + * @return The smoothed version of the data. 
+ */ + public static double[][] gaussianSmooth2d(double[][] data, int w){ + final int dataSize = data.length; + + if(dataSize == 0){ + throw new IllegalArgumentException("Cannot smooth empty data."); + } + + final int nDims = data[0].length; + + if(nDims == 0){ + throw new IllegalArgumentException("Cannot smooth a data point with no values. " + + "Uniformly populate every entry in your data with 1 or more dimensions."); + } + + double[][] smoothed = new double[dataSize][nDims]; + + for (int i = 0; i < dataSize; i++) { + int startIdx = Math.max(0, i - w); + int endIdx = Math.min(dataSize - 1, i + w); + + double[] sumWeights = new double[nDims]; + double sumIndexWeight = 0; + + for (int j = startIdx; j < endIdx + 1; j++) { + double indexScore = Math.abs(j - i)/(double)w; + double indexWeight = Maths.gaussian(indexScore, 1, 0, 1); + + for (int n = 0; n < nDims; n++) { + sumWeights[n] += (indexWeight * data[j][n]); + } + sumIndexWeight += indexWeight; + } + + for (int n = 0; n < nDims; n++) { + smoothed[i][n] = sumWeights[n]/sumIndexWeight; + } + } + return smoothed; + } + + /** + * Normalise the 1d data using min-max normalisation. + * @see Wikipedia article about feature re-scaling. + * @param data The data to normalise. + * @return The new array containing the normalised data. 
+ */ + public static double[] minmaxNormalise1d(double[] data){ + //find min and max value + double curMin = Double.POSITIVE_INFINITY; + double curMax = Double.NEGATIVE_INFINITY; + for (double v : data) { + if(v < curMin){ + curMin = v; + } + if(v > curMax){ + curMax = v; + } + } + + //normalise the data using min-max normalisation + //and also subtract each value from its normalised index + final double range = curMax - curMin; + double[] normalisedData = new double[data.length]; + + for (int i = 0; i < normalisedData.length; i++) { + normalisedData[i] = ((data[i] - curMin) / range); + } + return normalisedData; + } + + /** + * Performs min-max normalisation on n-dimensional data (as long as the dimensionality is uniform, that is, all data is 2d or all 3d etc.). + * @see Wikipedia article about feature re-scaling. + * @param data The data to normalised. + * @return A new normalised data-set. + */ + public static double[][] minmaxNormalise(double[][] data){ + + final int dataSize = data.length; + + if(dataSize == 0){ + throw new IllegalArgumentException("Cannot smooth empty data."); + } + + final int nDims = data[0].length; + + if(nDims == 0){ + throw new IllegalArgumentException("Cannot smooth a data point with no values. 
" + + "Uniformly populate every entry in your data with 1 or more dimensions."); + } + + //1) get min and max for each dimension of the data + + double[] minEachDim = new double[nDims]; + double[] maxEachDim = new double[nDims]; + for (int i = 0; i < nDims; i++) { + minEachDim[i] = Double.POSITIVE_INFINITY; + maxEachDim[i] = Double.NEGATIVE_INFINITY; + } + + for (double[] coords : data) { + for (int n = 0; n < nDims; n++) { + double v = coords[n]; + if (v < minEachDim[n]) { + minEachDim[n] = v; + } + if (v > maxEachDim[n]) { + maxEachDim[n] = v; + } + } + } + + //2) normalise the data using the min and max + double[] rangeEachDim = new double[nDims]; + for (int n = 0; n < nDims; n++) { + rangeEachDim[n] = maxEachDim[n] - minEachDim[n]; + } + + double[][] outputNormalised = new double[dataSize][nDims]; + for (int i = 0; i < dataSize; i++) { + for (int n = 0; n < nDims; n++) { + //normalising step + outputNormalised[i][n] = (data[i][n] - minEachDim[n]) / rangeEachDim[n]; + } + } + return outputNormalised; + } + +} \ No newline at end of file From 9118d1f9cf00c4bf08723b8b59a8fe34be48ec98 Mon Sep 17 00:00:00 2001 From: deysn Date: Wed, 21 Jul 2021 19:54:36 -0400 Subject: [PATCH 17/29] Adding knee finding algorithm in java --- src/main/java/edu/uc/rphash/kneefinder/Kneedle.java | 2 +- src/main/java/edu/uc/rphash/util/Maths.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/edu/uc/rphash/kneefinder/Kneedle.java b/src/main/java/edu/uc/rphash/kneefinder/Kneedle.java index d0a5224..93fa888 100644 --- a/src/main/java/edu/uc/rphash/kneefinder/Kneedle.java +++ b/src/main/java/edu/uc/rphash/kneefinder/Kneedle.java @@ -6,7 +6,7 @@ import java.util.ArrayList; -// to find the knee, taken from " https://github.com/lukehb/137-stopmove/blob/master/src/main/java/onethreeseven/stopmove/algorithm/Kneedle.java by Luke Bermingham" +// to find the knee, taken from " 
https://github.com/lukehb/137-stopmove/blob/master/src/main/java/onethreeseven/stopmove/algorithm/Kneedle.java by Luke Bermingham " /** * Given set of values look for the elbow/knee points. diff --git a/src/main/java/edu/uc/rphash/util/Maths.java b/src/main/java/edu/uc/rphash/util/Maths.java index 4f6364b..48c4d20 100644 --- a/src/main/java/edu/uc/rphash/util/Maths.java +++ b/src/main/java/edu/uc/rphash/util/Maths.java @@ -2,7 +2,7 @@ import java.util.*; -// taken from " https://github.com/lukehb/137-common/blob/master/src/main/java/onethreeseven/common/util/Maths.java by Luke Bermingham " +// taken from " https://github.com/lukehb/137-common/blob/master/src/main/java/onethreeseven/common/util/Maths.java by Luke Bermingham " /** * A utility of mathematical methods. */ From 50541306b7b42e7192f5c0669798c6f6f07b0de8 Mon Sep 17 00:00:00 2001 From: deysn Date: Wed, 21 Jul 2021 19:54:36 -0400 Subject: [PATCH 18/29] Adding knee finding algorithm of python in java usisng jython integrating python with java --- .../edu/uc/rphash/kneefinder/JythonTest.java | 381 +++++++++ .../edu/uc/rphash/kneefinder/KneeLocator.java | 791 ++++++++++++++++++ .../edu/uc/rphash/kneefinder/Kneedle.java | 2 +- src/main/java/edu/uc/rphash/util/Maths.java | 2 +- 4 files changed, 1174 insertions(+), 2 deletions(-) create mode 100644 src/main/java/edu/uc/rphash/kneefinder/JythonTest.java create mode 100644 src/main/java/edu/uc/rphash/kneefinder/KneeLocator.java diff --git a/src/main/java/edu/uc/rphash/kneefinder/JythonTest.java b/src/main/java/edu/uc/rphash/kneefinder/JythonTest.java new file mode 100644 index 0000000..f28633e --- /dev/null +++ b/src/main/java/edu/uc/rphash/kneefinder/JythonTest.java @@ -0,0 +1,381 @@ +package edu.uc.rphash.kneefinder; + + +import edu.uc.rphash.lsh.LSHkNN; +import edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.Maths; +import edu.uc.rphash.util.VectorUtil; + +import java.util.ArrayList; +import java.util.Random; +import java.util.Arrays; + 
+// to find the knee, modified from " https://github.com/lukehb/137-stopmove/blob/master/src/main/java/onethreeseven/stopmove/algorithm/Kneedle.java by Luke Bermingham " + +/** + * Given set of values look for the elbow/knee points. + * See paper: "Finding a Kneedle in a Haystack: Detecting Knee Points in System Behavior" + */ + + +public class JythonTest { + + /** + * Finds the indices of all local minimum or local maximum values. + * @param data The data to process + * @param findMinima If true find local minimums, else find local maximums. + * @return A list of the indices that have local minimum or maximum values. + */ + private ArrayList findCandidateIndices(double[][] data, boolean findMinima){ + ArrayList candidates = new ArrayList<>(); + //a coordinate is considered a candidate if both of its adjacent points have y-values + //that are greater or less (depending on whether we want local minima or local maxima) + for (int i = 1; i < data.length - 1; i++) { + double prev = data[i-1][1]; + double cur = data[i][1]; + double next = data[i+1][1]; + boolean isCandidate = (findMinima) ? (prev > cur && next > cur) : (prev < cur && next < cur); + if(isCandidate){ + candidates.add(i); + } + } + return candidates; + } + + + /** + * Find the index in the data the represents a most exaggerated elbow point. + * @param data the data to find an elbow in + * @return The index of the elbow point. + */ + private int findElbowIndex(double[] data){ + + int bestIdx = 0; + double bestScore = 0; + for (int i = 0; i < data.length; i++) { + double score = Math.abs(data[i]); + if(score > bestScore){ + bestScore = score; + bestIdx = i; + } + } + return bestIdx; + } + + /** + * Prepares the data by smoothing, then normalising into unit range 0-1, + * and finally, subtracting the y-value from the x-value. + * @param data The data to prepare. + * @param smoothingWindow Size of the smoothing window. + * @return The normalised data. 
+ */ + private double[][] prepare(double[][] data, int smoothingWindow){ + + //smooth the data to make local minimum/maximum easier to find (this is Step 1 in the paper) + double[][] smoothedData = Maths.gaussianSmooth2d(data, smoothingWindow); + + //prepare the data into the unit range (step 2 of paper) + double[][] normalisedData = Maths.minmaxNormalise(smoothedData); + + //subtract normalised x from normalised y (this is step 3 in the paper) + for (int i = 0; i < normalisedData.length; i++) { + normalisedData[i][1] = normalisedData[i][1] - normalisedData[i][0]; + } + + return normalisedData; + } + + private double computeAverageVarianceX(double[][] data){ + double sumVariance = 0; + for (int i = 0; i < data.length - 1; i++) { + sumVariance += data[i + 1][0] - data[i][0]; + } + return sumVariance / (data.length - 1); + } + + /** + * Uses a heuristic to find what may be an elbow in the 1d data. + * This method is a heuristic so it may return in invalid elbow. + * If you need guarantees use the other method {@link JythonTest#run(double[][], double, int, boolean)} + * @param data The + * @return A possible elbow for this 1d data. + */ + public double findElbowQuick(double[] data){ + if(data.length <= 1){ + return 0; + } + + // double[] normalisedData = Maths.minmaxNormalise1d(Maths.gaussianSmooth(data, 3)); // original parameter + double[] normalisedData = Maths.minmaxNormalise1d(Maths.gaussianSmooth(data, 1)); + //do kneedle y'-x' (in this case x' is normalised index value) + for (int i = 0; i < normalisedData.length; i++) { + double normalisedIndex = (double)i / data.length; + normalisedData[i] = normalisedData[i] - normalisedIndex; + } + + int elbowIdx = findElbowIndex(normalisedData); + return data[elbowIdx]; + } + + /** + * This algorithm finds the so-called elbow/knee in the data. + * See paper: "Finding a Kneedle in a Haystack: Detecting Knee Points in System Behavior" + * for more details. + * @param data The 2d data to find an elbow in. 
+ * @param s How many "flat" points to require before we consider it a knee/elbow. + * @param smoothingWindow The data is smoothed using Gaussian kernel average smoother, this parameter is the window used for averaging + * (higher values mean more smoothing, try 3 to begin with). + * @param findElbows Whether to find elbows or knees. true for elbows and false for knees. + * @return The elbow or knee values. + */ + public ArrayList run(double[][] data, double s, int smoothingWindow, boolean findElbows){ + + if(data.length == 0){ + throw new IllegalArgumentException("Cannot find elbow or knee points in empty data."); + } + if(data[0].length != 2){ + throw new IllegalArgumentException("Cannot run Kneedle, this method expects all data to be 2d."); + } + + ArrayList localMinMaxPts = new ArrayList<>(); + //do steps 1,2,3 of the paper in the prepare method + double[][] normalisedData = prepare(data, smoothingWindow); + //find candidate indices (this is step 4 in the paper) + { + ArrayList candidateIndices = findCandidateIndices(normalisedData, findElbows); + //go through each candidate index, i, and see if the indices after i are satisfy the threshold requirement + //(this is step 5 in the paper) + double step = computeAverageVarianceX(normalisedData); + step = findElbows ? step * s : step * -s; + + //check each candidate to see if it is a real elbow/knee + //(this is step 6 in the paper) + for (int i = 0; i < candidateIndices.size(); i++) { + Integer candidateIdx = candidateIndices.get(i); + Integer endIdx = (i + 1 < candidateIndices.size()) ? candidateIndices.get(i+1) : data.length; + + double threshold = normalisedData[candidateIdx][1] + step; + + for (int j = candidateIdx + 1; j < endIdx; j++) { + boolean isRealElbowOrKnee = (findElbows) ? 
+ normalisedData[j][1] > threshold : normalisedData[j][1] < threshold; + if(isRealElbowOrKnee) { + localMinMaxPts.add(data[candidateIdx]); + break; + } + } + } + } + return localMinMaxPts; + } + +// to test the funtion : + public static void main(String[] args){ + + JythonTest elbowcalculator = new JythonTest(); + + double elbowdata[]= new double[90]; + + for (int i=0 ; i<=89; i++) + { + elbowdata[i] = 89-i; + } + + +/* double elbowdata2 [] = + { 7304, 6978, 6666, 6463, 6326, 6048, 6032, 5762, 5742, + 5398, 5256, 5226, 5001, 4941, 4854, 4734, 4558, 4491, + 4411, 4333, 4234, 4139, 4056, 4022, 3867, 3808, 3745, + 3692, 3645, 3618, 3574, 3504, 3452, 3401, 3382, 3340, + 3301, 3247, 3190, 3179, 3154, 3089, 3045, 2988, 2993, + 2941, 2875, 2866, 2834, 2785, 2759, 2763, 2720, 2660, + 2690, 2635, 2632, 2574, 2555, 2545, 2513, 2491, 2496, + 2466, 2442, 2420, 2381, 2388, 2340, 2335, 2318, 2319, + 2308, 2262, 2235, 2259, 2221, 2202, 2184, 2170, 2160, + 2127, 2134, 2101, 2101, 2066, 2074, 2063, 2048, 2031 }; +*/ + double elbowdata2[] = {272445.84, + 139828.64, + 219647.36, + 149900.52, + 101875.555, + 90592.31, + 94776.5, + 59097.977, + 54506.95, + 70813.1, + 51619.59, + 72024.32, + 42364.402, + 49209.64, + 43121.777, + 58519.363, + 42506.32, + 53575.184, + 48930.42, + 67386.4, + 27424.889, + 58791.652, + 47980.53, + 57721.895, + 28586.846, + 47117.207, + 34060.79, + 46765.35, + 36411.176, + 38203.29, + 41664.164, + 30040.643, + 23410.227, + 37810.92, + 44158.805, + 36570.363, + 38791.527, + 26255.09, + 34368.848, + 33185.074, + 23464.494, + 58085.137, + 19323.424, + 28164.77, + 31947.02, + 34020.324, + 31572.951, + 40708.703, + 27046.771, + 37988.094, + 104162.72, + 33381.24, + 20126.354, + 23565.26, + 35915.094, + 34402.164, + 23505.94, + 25535.15, + 33915.32, + 25169.93, + 20888.271, + 36341.01, + 26020.947, + 29645.568, + 27043.643, + 24310.191, + 23757.668, + 19005.96, + 22007.072, + 17633.865, + 22680.45, + 11766.091, + 12725.509, + 34868.617, + 22989.531, + 23386.334, + 
17618.283, + 22736.342, + 18922.049, + 24434.168, + 13263.041, + 9256.854, + 18594.143, + 21928.807, + 29263.688, + 16141.0205, + 14283.08, + 16031.739, + 14628.732, + 19026.465, + 16398.363, + 22941.205, + 25078.521, + 16121.506, + 10316.715, + 24983.184, + 17508.658, + 16489.285, + 9556.006, + 10829.478, + } ; + + double elbow_point = elbowcalculator.findElbowQuick(elbowdata2); + System.out.print("elbow point value form 1D data : "+ elbow_point); + + double[][] elbowdata3 = new double[100][2] ; + for (int i= 0;i<=99;i++) { + + elbowdata3[i][1]= 99-i;} + + for (int i= 0;i<=99;i++) + { + elbowdata3[i][0]= elbowdata2[i]; + } + // System.out.print("\n" +"elbowdata3 : " + elbowdata3[88][1]); + + // public ArrayList run(double[][] data, double s, int smoothingWindow, boolean findElbows) + + ArrayList elbows = elbowcalculator.run ( elbowdata3, 0 , 1 , false); + + System.out.print("\n" + "number of elbow points : " + elbows.size()); + for (double[] point : elbows) { + System.out.print("\n" +"Knee point:" + Arrays.toString(point)); + System.out.println("\n" +"No. of clusters complement = " + point[1] ); + System.out.println("\n" + "No. 
of clusters = " + (elbowdata3.length - point[1])); + } + + +// +// double[][] testData = new double[][]{ +// new double[]{0,0}, +// new double[]{0.1, 0.55}, +// new double[]{0.2, 0.75}, +// new double[]{0.35, 0.825}, +// new double[]{0.45, 0.875}, +// new double[]{0.55, 0.9}, +// new double[]{0.675, 0.925}, +// new double[]{0.775, 0.95}, +// new double[]{0.875, 0.975}, +// new double[]{1,1} +// }; +// +// +// ArrayList kneePoints = new Kneedle().run(testData, 1, 1, false); +// +// for (double[] kneePoint : kneePoints) { +// System.out.println(); +// System.out.print("Knee point:" + Arrays.toString(kneePoint)); +// } +// +// +// double[][] testData2 = new double[][]{ +// new double[] { 200 , 9 }, +// new double[] { 100 , 8 }, +// new double[] { 75 , 7 }, +// new double[] { 50 , 6 }, +// new double[] { 48 , 5 }, +// new double[] { 45 , 4 }, +// new double[] { 42 , 3 }, +// new double[] { 40 , 2 }, +// new double[] { 39 , 1 }, +// new double[] { 38 , 0 } +// +// +// }; +// System.out.print("\n" + testData2[9][0]); +// +//// public ArrayList run(double[][] data, double s, int smoothingWindow, boolean findElbows) +// ArrayList kneePoints2 = new Kneedle().run(testData2, 0, 1, false); +// +// for (double[] point : kneePoints2) { +// System.out.print("\n" +"Knee point:" + Arrays.toString(point)); +// System.out.println("\n" +"No. of clusters = " + point[1] ); +// System.out.println("\n" + "No. 
of clusters = " + (testData2.length - point[1])); +// } + + + + + + + } + + +} \ No newline at end of file diff --git a/src/main/java/edu/uc/rphash/kneefinder/KneeLocator.java b/src/main/java/edu/uc/rphash/kneefinder/KneeLocator.java new file mode 100644 index 0000000..928b818 --- /dev/null +++ b/src/main/java/edu/uc/rphash/kneefinder/KneeLocator.java @@ -0,0 +1,791 @@ +package edu.uc.rphash.kneefinder; + +import edu.uc.rphash.Centroid; +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.frequentItemSet.KHHCentroidCounter; +import edu.uc.rphash.lsh.LSH; +import java.util.ArrayList; +import org.python.util.PythonInterpreter; +import org.python.core.*; + + + +import sys + +from scipy.constants import convert_temperature + + + +import numpy as np + +from scipy import interpolate + +from scipy.signal import argrelextrema + +from sklearn.preprocessing import PolynomialFeatures + +from sklearn.linear_model import LinearRegression + +import warnings + +from typing import Tuple, Optional, Iterable + +import matplotlib.pyplot as plt + +import pandas as pd + + + +import warnings # did not install + + + + + +def my_test(name, age, file): + + filename=file + + print(filename) + + print("name: "+name) + + print("age: "+age) + + print("2^10 : ") + + print( np.power(2,10)) + +# temperature=convert_temperature(np.array([-40, 40]), "Celsius", "Kelvin") + +# print(temperature) + + + + return filename + + + +#my_test(sys.argv[1], sys.argv[2], sys.argv[3]) # this is for the java calling + +#my_test("sam","25", "name") # this is for the python test + + + + + + + +def set_data( x,y): + + x_data = x + + y_data = y + + return (x_data , y_data) + + + + + +# knee test code : + + + +class KneeLocator(object): + + + + + + def __init__( + + self, + + x: Iterable[float], + + y: Iterable[float], + + S: float = 1.0, + + curve: str = "concave", + + direction: str = "increasing", + + interp_method: str = "interp1d", + + online: bool = False, + + + + + + ): + + """ + + Once 
instantiated, this class attempts to find the point of maximum + + curvature on a line. The knee is accessible via the `.knee` attribute. + + :param x: x values. + + :param y: y values. + + :param S: Sensitivity, original paper suggests default of 1.0 + + :param curve: If 'concave', algorithm will detect knees. If 'convex', it + + will detect elbows. + + :param direction: one of {"increasing", "decreasing"} + + :param interp_method: one of {"interp1d", "polynomial"} + + :param online: Will correct old knee points if True, will return first knee if False + + """ + + # Step 0: Raw Input + + self.x = np.array(x) + + self.y = np.array(y) + + self.curve = curve + + self.direction = direction + + self.N = len(self.x) + + self.S = S + + self.all_knees = set() + + self.all_norm_knees = set() + + self.all_knees_y = [] + + self.all_norm_knees_y = [] + + self.online = online + + + + + + # Step 1: fit a smooth line + + if interp_method == "interp1d": + + uspline = interpolate.interp1d(self.x, self.y) + + self.Ds_y = uspline(self.x) + + elif interp_method == "polynomial": + + pn_model = PolynomialFeatures(7) + + xpn = pn_model.fit_transform(self.x.reshape(-1, 1)) + + regr_model = LinearRegression() + + regr_model.fit(xpn, self.y) + + self.Ds_y = regr_model.predict( + + pn_model.fit_transform(self.x.reshape(-1, 1)) + + ) + + else: + + raise ValueError( + + "{} is an invalid interp_method parameter, use either 'interp1d' or 'polynomial'".format( + + interp_method + + ) + + ) + + + + # Step 2: normalize values + + self.x_normalized = self.__normalize(self.x) + + self.y_normalized = self.__normalize(self.Ds_y) + + + + # Step 3: Calculate the Difference curve + + self.x_normalized, self.y_normalized = self.transform_xy( + + self.x_normalized, self.y_normalized, self.direction, self.curve + + ) + + # normalized difference curve + + self.y_difference = self.y_normalized - self.x_normalized + + self.x_difference = self.x_normalized.copy() + + + + # Step 4: Identify local maxima/minima 
+ + # local maxima + + self.maxima_indices = argrelextrema(self.y_difference, np.greater_equal)[0] + + self.x_difference_maxima = self.x_difference[self.maxima_indices] + + self.y_difference_maxima = self.y_difference[self.maxima_indices] + + + + # local minima + + self.minima_indices = argrelextrema(self.y_difference, np.less_equal)[0] + + self.x_difference_minima = self.x_difference[self.minima_indices] + + self.y_difference_minima = self.y_difference[self.minima_indices] + + + + # Step 5: Calculate thresholds + + self.Tmx = self.y_difference_maxima - ( + + self.S * np.abs(np.diff(self.x_normalized).mean()) + + ) + + + + # Step 6: find knee + + self.knee, self.norm_knee = self.find_knee() + + + + # Step 7: If we have a knee, extract data about it + + self.knee_y = self.norm_knee_y = None + + if self.knee: + + self.knee_y = self.y[self.x == self.knee][0] + + self.norm_knee_y = self.y_normalized[self.x_normalized == self.norm_knee][0] + + + + + + + + def set_filename_from_java(self,file): + + filename= file + + return filename + + + + @staticmethod + + def __normalize(a: Iterable[float]) -> Iterable[float]: + + """normalize an array + + :param a: The array to normalize + + """ + + return (a - min(a)) / (max(a) - min(a)) + + + + @staticmethod + + def transform_xy( + + x: Iterable[float], y: Iterable[float], direction: str, curve: str + + ) -> Tuple[Iterable[float], Iterable[float]]: + + """transform x and y to concave, increasing based on given direction and curve""" + + # convert elbows to knees + + if curve == "convex": + + x = x.max() - x + + y = y.max() - y + + # flip decreasing functions to increasing + + if direction == "decreasing": + + y = np.flip(y, axis=0) + + + + if curve == "convex": + + x = np.flip(x, axis=0) + + y = np.flip(y, axis=0) + + + + return x, y + + + + def find_knee(self,): + + """This function finds and sets the knee value and the normalized knee value. 
""" + + if not self.maxima_indices.size: + + warnings.warn( + + "No local maxima found in the difference curve\n" + + "The line is probably not polynomial, try plotting\n" + + "the difference curve with plt.plot(knee.x_difference, knee.y_difference)\n" + + "Also check that you aren't mistakenly setting the curve argument", + + RuntimeWarning, + + ) + + return None, None + + + + # placeholder for which threshold region i is located in. + + maxima_threshold_index = 0 + + minima_threshold_index = 0 + + # traverse the difference curve + + for i, x in enumerate(self.x_difference): + + # skip points on the curve before the the first local maxima + + if i < self.maxima_indices[0]: + + continue + + + + j = i + 1 + + + + # reached the end of the curve + + if x == 1.0: + + break + + + + # if we're at a local max, increment the maxima threshold index and continue + + if (self.maxima_indices == i).any(): + + threshold = self.Tmx[maxima_threshold_index] + + threshold_index = i + + maxima_threshold_index += 1 + + # values in difference curve are at or after a local minimum + + if (self.minima_indices == i).any(): + + threshold = 0.0 + + minima_threshold_index += 1 + + + + if self.y_difference[j] < threshold: + + if self.curve == "convex": + + if self.direction == "decreasing": + + knee = self.x[threshold_index] + + norm_knee = self.x_normalized[threshold_index] + + else: + + knee = self.x[-(threshold_index + 1)] + + norm_knee = self.x_normalized[-(threshold_index + 1)] + + + + elif self.curve == "concave": + + if self.direction == "decreasing": + + knee = self.x[-(threshold_index + 1)] + + norm_knee = self.x_normalized[-(threshold_index + 1)] + + else: + + knee = self.x[threshold_index] + + norm_knee = self.x_normalized[threshold_index] + + + + # add the y value at the knee + + y_at_knee = self.y[self.x == knee][0] + + y_norm_at_knee = self.y_normalized[self.x_normalized == norm_knee][0] + + if knee not in self.all_knees: + + self.all_knees_y.append(y_at_knee) + + 
self.all_norm_knees_y.append(y_norm_at_knee) + + + + # now add the knee + + self.all_knees.add(knee) + + self.all_norm_knees.add(norm_knee) + + + + # if detecting in offline mode, return the first knee found + + if self.online is False: + + return knee, norm_knee + + + + if self.all_knees == set(): + + warnings.warn("No knee/elbow found") + + return None, None + + + + return knee, norm_knee + + + + def plot_knee_normalized(self, figsize: Optional[Tuple[int, int]] = None): + + """Plot the normalized curve, the difference curve (x_difference, y_normalized) and the knee, if it exists. + + + + :param figsize: Optional[Tuple[int, int] + + The figure size of the plot. Example (12, 8) + + :return: NoReturn + + """ + + import matplotlib.pyplot as plt + + + + if figsize is None: + + figsize = (6, 6) + + + + plt.figure(figsize=figsize) + + plt.title("Normalized Knee Point") + + plt.plot(self.x_normalized, self.y_normalized, "b", label="normalized curve") + + plt.plot(self.x_difference, self.y_difference, "r", label="difference curve") + + plt.xticks( + + np.arange(self.x_normalized.min(), self.x_normalized.max() + 0.1, 0.1) + + ) + + plt.yticks( + + np.arange(self.y_difference.min(), self.y_normalized.max() + 0.1, 0.1) + + ) + + + + plt.vlines( + + self.norm_knee, + + plt.ylim()[0], + + plt.ylim()[1], + + linestyles="--", + + label="knee/elbow", + + ) + + plt.legend(loc="best") + + + + def plot_knee(self, figsize: Optional[Tuple[int, int]] = None): + + """ + + Plot the curve and the knee, if it exists + + + + :param figsize: Optional[Tuple[int, int] + + The figure size of the plot. 
Example (12, 8) + + :return: NoReturn + + """ + + import matplotlib.pyplot as plt + + + + if figsize is None: + + figsize = (6, 6) + + + + plt.figure(figsize=figsize) + + plt.title("Knee Point") + + plt.plot(self.x, self.y, "b", label="data") + + plt.vlines( + + self.knee, plt.ylim()[0], plt.ylim()[1], linestyles="--", label="knee/elbow" + + ) + + plt.legend(loc="best") + + + + # Niceties for users working with elbows rather than knees + + @property + + def elbow(self): + + return self.knee + + + + @property + + def norm_elbow(self): + + return self.norm_knee + + + + @property + + def elbow_y(self): + + return self.knee_y + + + + @property + + def norm_elbow_y(self): + + return self.norm_knee_y + + + + @property + + def all_elbows(self): + + return self.all_knees + + + + @property + + def all_norm_elbows(self): + + return self.all_norm_knees + + + + @property + + def all_elbows_y(self): + + return self.all_knees_y + + + + @property + + def all_norm_elbows_y(self): + + return self.all_norm_knees_y + + + + + +## xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + + + +#df=pd.read_excel("C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/run_results/3runs/testdata.xlsx") + +#df=pd.read_excel("data.xlsx", sheet_name='har2', header=None, na_values=['NA'], usecols="Aq,at",skiprows=range(97),nrows=6) + + + +nameoffile = my_test(sys.argv[1], sys.argv[2], sys.argv[3]) # this is for the java calling + +#nameoffile = my_test("sam","25", "C:/Users/sayan/Documents/testdata/data.xlsx") # this is for the python test + + + +#nameoffile_1 = "C:/Users/sayan/Documents/testdata/data.xlsx" + +df=pd.read_excel(nameoffile, sheet_name='Sheet1', header=None, na_values=['NA']) + +print(df) + +conv_arr= df.values + + + +#split matrix into 3 columns each into 1d array + +#print(conv_arr.shape) + +#print(conv_arr[1,1]) + +arr1 = np.delete(conv_arr,1,axis=1) + +arr2 = np.delete(conv_arr,0,axis=1) + + + +#converting into 1D array + +x = arr1.ravel() + +y = 
arr2.ravel() + + + +kn = KneeLocator(list(x), y , S=0.0, curve='convex', direction='decreasing',online=False ) #,interp_method='polynomial') + +#kn.set_filename_from_java("C:/Users/sayan/Documents/testdata/data.xlsx") + + + + + +kn2 = KneeLocator(list(x), y , S=1.0, curve='convex', direction='decreasing',online=False ) + +print(kn.knee) + +print(kn2.knee) + +#print(kn.norm_knee) + + + +# plt.style.use('ggplot') + +# plt.plot() + +# plt.xlabel('K (no. of clusters) ') + +# plt.ylabel('WCSSE') + +# #plt.title('Elbow method for optimal k.[data=HAR, k=4, Pred. k= %d]' %(kn.knee)) + +# plt.suptitle('Elbow Method For Optimal Cluster Determination [data=HAR_4clus, K=4, Pred.K = %d]' %(kn.knee),x=0.5, y=0.000, ha="center" , va="bottom") + +# plt.plot(x, y, 'bx-') + +# #plt.xscale('log') + +# plt.grid(True) + +# plt.xticks() + +# plt.vlines(kn.knee, plt.ylim()[0], plt.ylim()[1], linestyles='dashed') + +# plt.savefig("C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/run_results/3runs/graphs/test1.pdf") + +# plt.show() + +# + +# plt.style.use('ggplot') + +# plt.plot() + +# plt.xlabel('Buckets') + +# plt.ylabel('Counts') + +# plt.title('Elbow method for optimal k. [data=NOISE_30_1, k=10, Pred. 
k= %d]' %(kn2.knee)) + +# plt.plot(x, y, 'bx-') + +# #plt.xscale('log') + +# plt.grid(True) + +# plt.xticks() + +# plt.vlines(kn2.knee, plt.ylim()[0], plt.ylim()[1], linestyles='dashed') + +# plt.savefig("C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/run_results/3runs/graphs/test2.pdf") + +# plt.show() diff --git a/src/main/java/edu/uc/rphash/kneefinder/Kneedle.java b/src/main/java/edu/uc/rphash/kneefinder/Kneedle.java index d0a5224..93fa888 100644 --- a/src/main/java/edu/uc/rphash/kneefinder/Kneedle.java +++ b/src/main/java/edu/uc/rphash/kneefinder/Kneedle.java @@ -6,7 +6,7 @@ import java.util.ArrayList; -// to find the knee, taken from " https://github.com/lukehb/137-stopmove/blob/master/src/main/java/onethreeseven/stopmove/algorithm/Kneedle.java by Luke Bermingham" +// to find the knee, taken from " https://github.com/lukehb/137-stopmove/blob/master/src/main/java/onethreeseven/stopmove/algorithm/Kneedle.java by Luke Bermingham " /** * Given set of values look for the elbow/knee points. diff --git a/src/main/java/edu/uc/rphash/util/Maths.java b/src/main/java/edu/uc/rphash/util/Maths.java index 4f6364b..48c4d20 100644 --- a/src/main/java/edu/uc/rphash/util/Maths.java +++ b/src/main/java/edu/uc/rphash/util/Maths.java @@ -2,7 +2,7 @@ import java.util.*; -// taken from " https://github.com/lukehb/137-common/blob/master/src/main/java/onethreeseven/common/util/Maths.java by Luke Bermingham " +// taken from " https://github.com/lukehb/137-common/blob/master/src/main/java/onethreeseven/common/util/Maths.java by Luke Bermingham " /** * A utility of mathematical methods. */ From b3d110071577be95beeeb86f12d19d5f28aa9dd6 Mon Sep 17 00:00:00 2001 From: deysn Date: Sat, 11 Dec 2021 03:17:38 -0500 Subject: [PATCH 19/29] Jython integration. Using python knee finding function in Java. 
--- src/main/java/edu/uc/rphash/kneefinder/JythonTest.java | 1 - src/main/java/edu/uc/rphash/kneefinder/KneeLocator.java | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/edu/uc/rphash/kneefinder/JythonTest.java b/src/main/java/edu/uc/rphash/kneefinder/JythonTest.java index f28633e..4ed89c2 100644 --- a/src/main/java/edu/uc/rphash/kneefinder/JythonTest.java +++ b/src/main/java/edu/uc/rphash/kneefinder/JythonTest.java @@ -17,7 +17,6 @@ * See paper: "Finding a Kneedle in a Haystack: Detecting Knee Points in System Behavior" */ - public class JythonTest { /** diff --git a/src/main/java/edu/uc/rphash/kneefinder/KneeLocator.java b/src/main/java/edu/uc/rphash/kneefinder/KneeLocator.java index 928b818..2b4a03f 100644 --- a/src/main/java/edu/uc/rphash/kneefinder/KneeLocator.java +++ b/src/main/java/edu/uc/rphash/kneefinder/KneeLocator.java @@ -38,6 +38,7 @@ import warnings # did not install + From 1af3d59cf72576882f0238488636732c3df7f600 Mon Sep 17 00:00:00 2001 From: sayan Date: Sat, 11 Dec 2021 04:53:59 -0500 Subject: [PATCH 20/29] updating the knee finder --- .classpath | 1 + .pydevproject | 5 + .../edu/uc/rphash/kneefinder/JythonTest2.java | 47 + .../{KneeLocator.java => KneeLocator.py} | 1584 ++++++++--------- 4 files changed, 845 insertions(+), 792 deletions(-) create mode 100644 .pydevproject create mode 100644 src/main/java/edu/uc/rphash/kneefinder/JythonTest2.java rename src/main/java/edu/uc/rphash/kneefinder/{KneeLocator.java => KneeLocator.py} (93%) diff --git a/.classpath b/.classpath index 0c97049..ba55ddb 100644 --- a/.classpath +++ b/.classpath @@ -9,5 +9,6 @@ + diff --git a/.pydevproject b/.pydevproject new file mode 100644 index 0000000..98ee0df --- /dev/null +++ b/.pydevproject @@ -0,0 +1,5 @@ + + + Default + python interpreter + diff --git a/src/main/java/edu/uc/rphash/kneefinder/JythonTest2.java b/src/main/java/edu/uc/rphash/kneefinder/JythonTest2.java new file mode 100644 index 0000000..979795c --- /dev/null +++ 
b/src/main/java/edu/uc/rphash/kneefinder/JythonTest2.java @@ -0,0 +1,47 @@ +package edu.uc.rphash.kneefinder; +import org.python.util.PythonInterpreter; + +import java.io.BufferedReader; +import java.io.InputStreamReader; + +import org.python.core.*; + +class JythonTest2 +{ + +//// does not work if there are external imports: + +// public static void main(String[] args) { +// PythonInterpreter interpreter = new PythonInterpreter(); +// +// interpreter.execfile("C:\\Users\\sayan\\eclipse-workspace\\pythonfunc\\pythonfunc1.py"); +// PyFunction function = (PyFunction)interpreter.get("my_test",PyFunction.class); +// PyObject pyobject = function.__call__(new PyString("huzhiweiww"),new PyString("2225")); +// System.out.println("anwser = " + pyobject.toString()); +// } +// + + + + public static void main(String[] args) { + + + + // xarray_1 = + // yarray_2= + String[] arguments = new String[] {"python", "C:\\Users\\sayan\\eclipse-workspace\\pythonfunc\\pythonfunc2.py" , "huzhiwei", "25", "C:/Users/sayan/Documents/testdata/data.xlsx"}; + try { + Process process = Runtime.getRuntime().exec(arguments); + BufferedReader in = new BufferedReader(new InputStreamReader(process.getInputStream())); + String line = null; + while ((line = in.readLine()) != null) { + System.out.println(line); + } + in.close(); + int re = process.waitFor(); + System.out.println(re); + } catch (Exception e) { + e.printStackTrace(); + } + } +} diff --git a/src/main/java/edu/uc/rphash/kneefinder/KneeLocator.java b/src/main/java/edu/uc/rphash/kneefinder/KneeLocator.py similarity index 93% rename from src/main/java/edu/uc/rphash/kneefinder/KneeLocator.java rename to src/main/java/edu/uc/rphash/kneefinder/KneeLocator.py index 2b4a03f..4d017aa 100644 --- a/src/main/java/edu/uc/rphash/kneefinder/KneeLocator.java +++ b/src/main/java/edu/uc/rphash/kneefinder/KneeLocator.py @@ -1,792 +1,792 @@ -package edu.uc.rphash.kneefinder; - -import edu.uc.rphash.Centroid; -import edu.uc.rphash.Readers.RPHashObject; -import 
edu.uc.rphash.frequentItemSet.KHHCentroidCounter; -import edu.uc.rphash.lsh.LSH; -import java.util.ArrayList; -import org.python.util.PythonInterpreter; -import org.python.core.*; - - - -import sys - -from scipy.constants import convert_temperature - - - -import numpy as np - -from scipy import interpolate - -from scipy.signal import argrelextrema - -from sklearn.preprocessing import PolynomialFeatures - -from sklearn.linear_model import LinearRegression - -import warnings - -from typing import Tuple, Optional, Iterable - -import matplotlib.pyplot as plt - -import pandas as pd - - - -import warnings # did not install - - - - - - -def my_test(name, age, file): - - filename=file - - print(filename) - - print("name: "+name) - - print("age: "+age) - - print("2^10 : ") - - print( np.power(2,10)) - -# temperature=convert_temperature(np.array([-40, 40]), "Celsius", "Kelvin") - -# print(temperature) - - - - return filename - - - -#my_test(sys.argv[1], sys.argv[2], sys.argv[3]) # this is for the java calling - -#my_test("sam","25", "name") # this is for the python test - - - - - - - -def set_data( x,y): - - x_data = x - - y_data = y - - return (x_data , y_data) - - - - - -# knee test code : - - - -class KneeLocator(object): - - - - - - def __init__( - - self, - - x: Iterable[float], - - y: Iterable[float], - - S: float = 1.0, - - curve: str = "concave", - - direction: str = "increasing", - - interp_method: str = "interp1d", - - online: bool = False, - - - - - - ): - - """ - - Once instantiated, this class attempts to find the point of maximum - - curvature on a line. The knee is accessible via the `.knee` attribute. - - :param x: x values. - - :param y: y values. - - :param S: Sensitivity, original paper suggests default of 1.0 - - :param curve: If 'concave', algorithm will detect knees. If 'convex', it - - will detect elbows. 
- - :param direction: one of {"increasing", "decreasing"} - - :param interp_method: one of {"interp1d", "polynomial"} - - :param online: Will correct old knee points if True, will return first knee if False - - """ - - # Step 0: Raw Input - - self.x = np.array(x) - - self.y = np.array(y) - - self.curve = curve - - self.direction = direction - - self.N = len(self.x) - - self.S = S - - self.all_knees = set() - - self.all_norm_knees = set() - - self.all_knees_y = [] - - self.all_norm_knees_y = [] - - self.online = online - - - - - - # Step 1: fit a smooth line - - if interp_method == "interp1d": - - uspline = interpolate.interp1d(self.x, self.y) - - self.Ds_y = uspline(self.x) - - elif interp_method == "polynomial": - - pn_model = PolynomialFeatures(7) - - xpn = pn_model.fit_transform(self.x.reshape(-1, 1)) - - regr_model = LinearRegression() - - regr_model.fit(xpn, self.y) - - self.Ds_y = regr_model.predict( - - pn_model.fit_transform(self.x.reshape(-1, 1)) - - ) - - else: - - raise ValueError( - - "{} is an invalid interp_method parameter, use either 'interp1d' or 'polynomial'".format( - - interp_method - - ) - - ) - - - - # Step 2: normalize values - - self.x_normalized = self.__normalize(self.x) - - self.y_normalized = self.__normalize(self.Ds_y) - - - - # Step 3: Calculate the Difference curve - - self.x_normalized, self.y_normalized = self.transform_xy( - - self.x_normalized, self.y_normalized, self.direction, self.curve - - ) - - # normalized difference curve - - self.y_difference = self.y_normalized - self.x_normalized - - self.x_difference = self.x_normalized.copy() - - - - # Step 4: Identify local maxima/minima - - # local maxima - - self.maxima_indices = argrelextrema(self.y_difference, np.greater_equal)[0] - - self.x_difference_maxima = self.x_difference[self.maxima_indices] - - self.y_difference_maxima = self.y_difference[self.maxima_indices] - - - - # local minima - - self.minima_indices = argrelextrema(self.y_difference, np.less_equal)[0] - - 
self.x_difference_minima = self.x_difference[self.minima_indices] - - self.y_difference_minima = self.y_difference[self.minima_indices] - - - - # Step 5: Calculate thresholds - - self.Tmx = self.y_difference_maxima - ( - - self.S * np.abs(np.diff(self.x_normalized).mean()) - - ) - - - - # Step 6: find knee - - self.knee, self.norm_knee = self.find_knee() - - - - # Step 7: If we have a knee, extract data about it - - self.knee_y = self.norm_knee_y = None - - if self.knee: - - self.knee_y = self.y[self.x == self.knee][0] - - self.norm_knee_y = self.y_normalized[self.x_normalized == self.norm_knee][0] - - - - - - - - def set_filename_from_java(self,file): - - filename= file - - return filename - - - - @staticmethod - - def __normalize(a: Iterable[float]) -> Iterable[float]: - - """normalize an array - - :param a: The array to normalize - - """ - - return (a - min(a)) / (max(a) - min(a)) - - - - @staticmethod - - def transform_xy( - - x: Iterable[float], y: Iterable[float], direction: str, curve: str - - ) -> Tuple[Iterable[float], Iterable[float]]: - - """transform x and y to concave, increasing based on given direction and curve""" - - # convert elbows to knees - - if curve == "convex": - - x = x.max() - x - - y = y.max() - y - - # flip decreasing functions to increasing - - if direction == "decreasing": - - y = np.flip(y, axis=0) - - - - if curve == "convex": - - x = np.flip(x, axis=0) - - y = np.flip(y, axis=0) - - - - return x, y - - - - def find_knee(self,): - - """This function finds and sets the knee value and the normalized knee value. 
""" - - if not self.maxima_indices.size: - - warnings.warn( - - "No local maxima found in the difference curve\n" - - "The line is probably not polynomial, try plotting\n" - - "the difference curve with plt.plot(knee.x_difference, knee.y_difference)\n" - - "Also check that you aren't mistakenly setting the curve argument", - - RuntimeWarning, - - ) - - return None, None - - - - # placeholder for which threshold region i is located in. - - maxima_threshold_index = 0 - - minima_threshold_index = 0 - - # traverse the difference curve - - for i, x in enumerate(self.x_difference): - - # skip points on the curve before the the first local maxima - - if i < self.maxima_indices[0]: - - continue - - - - j = i + 1 - - - - # reached the end of the curve - - if x == 1.0: - - break - - - - # if we're at a local max, increment the maxima threshold index and continue - - if (self.maxima_indices == i).any(): - - threshold = self.Tmx[maxima_threshold_index] - - threshold_index = i - - maxima_threshold_index += 1 - - # values in difference curve are at or after a local minimum - - if (self.minima_indices == i).any(): - - threshold = 0.0 - - minima_threshold_index += 1 - - - - if self.y_difference[j] < threshold: - - if self.curve == "convex": - - if self.direction == "decreasing": - - knee = self.x[threshold_index] - - norm_knee = self.x_normalized[threshold_index] - - else: - - knee = self.x[-(threshold_index + 1)] - - norm_knee = self.x_normalized[-(threshold_index + 1)] - - - - elif self.curve == "concave": - - if self.direction == "decreasing": - - knee = self.x[-(threshold_index + 1)] - - norm_knee = self.x_normalized[-(threshold_index + 1)] - - else: - - knee = self.x[threshold_index] - - norm_knee = self.x_normalized[threshold_index] - - - - # add the y value at the knee - - y_at_knee = self.y[self.x == knee][0] - - y_norm_at_knee = self.y_normalized[self.x_normalized == norm_knee][0] - - if knee not in self.all_knees: - - self.all_knees_y.append(y_at_knee) - - 
self.all_norm_knees_y.append(y_norm_at_knee) - - - - # now add the knee - - self.all_knees.add(knee) - - self.all_norm_knees.add(norm_knee) - - - - # if detecting in offline mode, return the first knee found - - if self.online is False: - - return knee, norm_knee - - - - if self.all_knees == set(): - - warnings.warn("No knee/elbow found") - - return None, None - - - - return knee, norm_knee - - - - def plot_knee_normalized(self, figsize: Optional[Tuple[int, int]] = None): - - """Plot the normalized curve, the difference curve (x_difference, y_normalized) and the knee, if it exists. - - - - :param figsize: Optional[Tuple[int, int] - - The figure size of the plot. Example (12, 8) - - :return: NoReturn - - """ - - import matplotlib.pyplot as plt - - - - if figsize is None: - - figsize = (6, 6) - - - - plt.figure(figsize=figsize) - - plt.title("Normalized Knee Point") - - plt.plot(self.x_normalized, self.y_normalized, "b", label="normalized curve") - - plt.plot(self.x_difference, self.y_difference, "r", label="difference curve") - - plt.xticks( - - np.arange(self.x_normalized.min(), self.x_normalized.max() + 0.1, 0.1) - - ) - - plt.yticks( - - np.arange(self.y_difference.min(), self.y_normalized.max() + 0.1, 0.1) - - ) - - - - plt.vlines( - - self.norm_knee, - - plt.ylim()[0], - - plt.ylim()[1], - - linestyles="--", - - label="knee/elbow", - - ) - - plt.legend(loc="best") - - - - def plot_knee(self, figsize: Optional[Tuple[int, int]] = None): - - """ - - Plot the curve and the knee, if it exists - - - - :param figsize: Optional[Tuple[int, int] - - The figure size of the plot. 
Example (12, 8) - - :return: NoReturn - - """ - - import matplotlib.pyplot as plt - - - - if figsize is None: - - figsize = (6, 6) - - - - plt.figure(figsize=figsize) - - plt.title("Knee Point") - - plt.plot(self.x, self.y, "b", label="data") - - plt.vlines( - - self.knee, plt.ylim()[0], plt.ylim()[1], linestyles="--", label="knee/elbow" - - ) - - plt.legend(loc="best") - - - - # Niceties for users working with elbows rather than knees - - @property - - def elbow(self): - - return self.knee - - - - @property - - def norm_elbow(self): - - return self.norm_knee - - - - @property - - def elbow_y(self): - - return self.knee_y - - - - @property - - def norm_elbow_y(self): - - return self.norm_knee_y - - - - @property - - def all_elbows(self): - - return self.all_knees - - - - @property - - def all_norm_elbows(self): - - return self.all_norm_knees - - - - @property - - def all_elbows_y(self): - - return self.all_knees_y - - - - @property - - def all_norm_elbows_y(self): - - return self.all_norm_knees_y - - - - - -## xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx - - - -#df=pd.read_excel("C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/run_results/3runs/testdata.xlsx") - -#df=pd.read_excel("data.xlsx", sheet_name='har2', header=None, na_values=['NA'], usecols="Aq,at",skiprows=range(97),nrows=6) - - - -nameoffile = my_test(sys.argv[1], sys.argv[2], sys.argv[3]) # this is for the java calling - -#nameoffile = my_test("sam","25", "C:/Users/sayan/Documents/testdata/data.xlsx") # this is for the python test - - - -#nameoffile_1 = "C:/Users/sayan/Documents/testdata/data.xlsx" - -df=pd.read_excel(nameoffile, sheet_name='Sheet1', header=None, na_values=['NA']) - -print(df) - -conv_arr= df.values - - - -#split matrix into 3 columns each into 1d array - -#print(conv_arr.shape) - -#print(conv_arr[1,1]) - -arr1 = np.delete(conv_arr,1,axis=1) - -arr2 = np.delete(conv_arr,0,axis=1) - - - -#converting into 1D array - -x = arr1.ravel() - -y = 
arr2.ravel() - - - -kn = KneeLocator(list(x), y , S=0.0, curve='convex', direction='decreasing',online=False ) #,interp_method='polynomial') - -#kn.set_filename_from_java("C:/Users/sayan/Documents/testdata/data.xlsx") - - - - - -kn2 = KneeLocator(list(x), y , S=1.0, curve='convex', direction='decreasing',online=False ) - -print(kn.knee) - -print(kn2.knee) - -#print(kn.norm_knee) - - - -# plt.style.use('ggplot') - -# plt.plot() - -# plt.xlabel('K (no. of clusters) ') - -# plt.ylabel('WCSSE') - -# #plt.title('Elbow method for optimal k.[data=HAR, k=4, Pred. k= %d]' %(kn.knee)) - -# plt.suptitle('Elbow Method For Optimal Cluster Determination [data=HAR_4clus, K=4, Pred.K = %d]' %(kn.knee),x=0.5, y=0.000, ha="center" , va="bottom") - -# plt.plot(x, y, 'bx-') - -# #plt.xscale('log') - -# plt.grid(True) - -# plt.xticks() - -# plt.vlines(kn.knee, plt.ylim()[0], plt.ylim()[1], linestyles='dashed') - -# plt.savefig("C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/run_results/3runs/graphs/test1.pdf") - -# plt.show() - -# - -# plt.style.use('ggplot') - -# plt.plot() - -# plt.xlabel('Buckets') - -# plt.ylabel('Counts') - -# plt.title('Elbow method for optimal k. [data=NOISE_30_1, k=10, Pred. 
k= %d]' %(kn2.knee)) - -# plt.plot(x, y, 'bx-') - -# #plt.xscale('log') - -# plt.grid(True) - -# plt.xticks() - -# plt.vlines(kn2.knee, plt.ylim()[0], plt.ylim()[1], linestyles='dashed') - -# plt.savefig("C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/run_results/3runs/graphs/test2.pdf") - -# plt.show() +#package edu.uc.rphash.kneefinder; + +import edu.uc.rphash.Centroid; +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.frequentItemSet.KHHCentroidCounter; +import edu.uc.rphash.lsh.LSH; +import java.util.ArrayList; +#import org.python.util.PythonInterpreter; +#import org.python.core.*; + + + +import sys + +from scipy.constants import convert_temperature + + + +import numpy as np + +from scipy import interpolate + +from scipy.signal import argrelextrema + +from sklearn.preprocessing import PolynomialFeatures + +from sklearn.linear_model import LinearRegression + +import warnings + +from typing import Tuple, Optional, Iterable + +import matplotlib.pyplot as plt + +import pandas as pd + + + +import warnings # did not install + + + + + + +def my_test(name, age, file): + + filename=file + + print(filename) + + print("name: "+name) + + print("age: "+age) + + print("2^10 : ") + + print( np.power(2,10)) + +# temperature=convert_temperature(np.array([-40, 40]), "Celsius", "Kelvin") + +# print(temperature) + + + + return filename + + + +#my_test(sys.argv[1], sys.argv[2], sys.argv[3]) # this is for the java calling + +#my_test("sam","25", "name") # this is for the python test + + + + + + + +def set_data( x,y): + + x_data = x + + y_data = y + + return (x_data , y_data) + + + + + +# knee test code : + + + +class KneeLocator(object): + + + + + + def __init__( + + self, + + x: Iterable[float], + + y: Iterable[float], + + S: float = 1.0, + + curve: str = "concave", + + direction: str = "increasing", + + interp_method: str = "interp1d", + + online: bool = False, + + + + + + ): + + """ + + Once instantiated, this class attempts to find the point of 
maximum + + curvature on a line. The knee is accessible via the `.knee` attribute. + + :param x: x values. + + :param y: y values. + + :param S: Sensitivity, original paper suggests default of 1.0 + + :param curve: If 'concave', algorithm will detect knees. If 'convex', it + + will detect elbows. + + :param direction: one of {"increasing", "decreasing"} + + :param interp_method: one of {"interp1d", "polynomial"} + + :param online: Will correct old knee points if True, will return first knee if False + + """ + + # Step 0: Raw Input + + self.x = np.array(x) + + self.y = np.array(y) + + self.curve = curve + + self.direction = direction + + self.N = len(self.x) + + self.S = S + + self.all_knees = set() + + self.all_norm_knees = set() + + self.all_knees_y = [] + + self.all_norm_knees_y = [] + + self.online = online + + + + + + # Step 1: fit a smooth line + + if interp_method == "interp1d": + + uspline = interpolate.interp1d(self.x, self.y) + + self.Ds_y = uspline(self.x) + + elif interp_method == "polynomial": + + pn_model = PolynomialFeatures(7) + + xpn = pn_model.fit_transform(self.x.reshape(-1, 1)) + + regr_model = LinearRegression() + + regr_model.fit(xpn, self.y) + + self.Ds_y = regr_model.predict( + + pn_model.fit_transform(self.x.reshape(-1, 1)) + + ) + + else: + + raise ValueError( + + "{} is an invalid interp_method parameter, use either 'interp1d' or 'polynomial'".format( + + interp_method + + ) + + ) + + + + # Step 2: normalize values + + self.x_normalized = self.__normalize(self.x) + + self.y_normalized = self.__normalize(self.Ds_y) + + + + # Step 3: Calculate the Difference curve + + self.x_normalized, self.y_normalized = self.transform_xy( + + self.x_normalized, self.y_normalized, self.direction, self.curve + + ) + + # normalized difference curve + + self.y_difference = self.y_normalized - self.x_normalized + + self.x_difference = self.x_normalized.copy() + + + + # Step 4: Identify local maxima/minima + + # local maxima + + self.maxima_indices = 
argrelextrema(self.y_difference, np.greater_equal)[0] + + self.x_difference_maxima = self.x_difference[self.maxima_indices] + + self.y_difference_maxima = self.y_difference[self.maxima_indices] + + + + # local minima + + self.minima_indices = argrelextrema(self.y_difference, np.less_equal)[0] + + self.x_difference_minima = self.x_difference[self.minima_indices] + + self.y_difference_minima = self.y_difference[self.minima_indices] + + + + # Step 5: Calculate thresholds + + self.Tmx = self.y_difference_maxima - ( + + self.S * np.abs(np.diff(self.x_normalized).mean()) + + ) + + + + # Step 6: find knee + + self.knee, self.norm_knee = self.find_knee() + + + + # Step 7: If we have a knee, extract data about it + + self.knee_y = self.norm_knee_y = None + + if self.knee: + + self.knee_y = self.y[self.x == self.knee][0] + + self.norm_knee_y = self.y_normalized[self.x_normalized == self.norm_knee][0] + + + + + + + + def set_filename_from_java(self,file): + + filename= file + + return filename + + + + @staticmethod + + def __normalize(a: Iterable[float]) -> Iterable[float]: + + """normalize an array + + :param a: The array to normalize + + """ + + return (a - min(a)) / (max(a) - min(a)) + + + + @staticmethod + + def transform_xy( + + x: Iterable[float], y: Iterable[float], direction: str, curve: str + + ) -> Tuple[Iterable[float], Iterable[float]]: + + """transform x and y to concave, increasing based on given direction and curve""" + + # convert elbows to knees + + if curve == "convex": + + x = x.max() - x + + y = y.max() - y + + # flip decreasing functions to increasing + + if direction == "decreasing": + + y = np.flip(y, axis=0) + + + + if curve == "convex": + + x = np.flip(x, axis=0) + + y = np.flip(y, axis=0) + + + + return x, y + + + + def find_knee(self,): + + """This function finds and sets the knee value and the normalized knee value. 
""" + + if not self.maxima_indices.size: + + warnings.warn( + + "No local maxima found in the difference curve\n" + + "The line is probably not polynomial, try plotting\n" + + "the difference curve with plt.plot(knee.x_difference, knee.y_difference)\n" + + "Also check that you aren't mistakenly setting the curve argument", + + RuntimeWarning, + + ) + + return None, None + + + + # placeholder for which threshold region i is located in. + + maxima_threshold_index = 0 + + minima_threshold_index = 0 + + # traverse the difference curve + + for i, x in enumerate(self.x_difference): + + # skip points on the curve before the the first local maxima + + if i < self.maxima_indices[0]: + + continue + + + + j = i + 1 + + + + # reached the end of the curve + + if x == 1.0: + + break + + + + # if we're at a local max, increment the maxima threshold index and continue + + if (self.maxima_indices == i).any(): + + threshold = self.Tmx[maxima_threshold_index] + + threshold_index = i + + maxima_threshold_index += 1 + + # values in difference curve are at or after a local minimum + + if (self.minima_indices == i).any(): + + threshold = 0.0 + + minima_threshold_index += 1 + + + + if self.y_difference[j] < threshold: + + if self.curve == "convex": + + if self.direction == "decreasing": + + knee = self.x[threshold_index] + + norm_knee = self.x_normalized[threshold_index] + + else: + + knee = self.x[-(threshold_index + 1)] + + norm_knee = self.x_normalized[-(threshold_index + 1)] + + + + elif self.curve == "concave": + + if self.direction == "decreasing": + + knee = self.x[-(threshold_index + 1)] + + norm_knee = self.x_normalized[-(threshold_index + 1)] + + else: + + knee = self.x[threshold_index] + + norm_knee = self.x_normalized[threshold_index] + + + + # add the y value at the knee + + y_at_knee = self.y[self.x == knee][0] + + y_norm_at_knee = self.y_normalized[self.x_normalized == norm_knee][0] + + if knee not in self.all_knees: + + self.all_knees_y.append(y_at_knee) + + 
self.all_norm_knees_y.append(y_norm_at_knee) + + + + # now add the knee + + self.all_knees.add(knee) + + self.all_norm_knees.add(norm_knee) + + + + # if detecting in offline mode, return the first knee found + + if self.online is False: + + return knee, norm_knee + + + + if self.all_knees == set(): + + warnings.warn("No knee/elbow found") + + return None, None + + + + return knee, norm_knee + + + + def plot_knee_normalized(self, figsize: Optional[Tuple[int, int]] = None): + + """Plot the normalized curve, the difference curve (x_difference, y_normalized) and the knee, if it exists. + + + + :param figsize: Optional[Tuple[int, int] + + The figure size of the plot. Example (12, 8) + + :return: NoReturn + + """ + + import matplotlib.pyplot as plt + + + + if figsize is None: + + figsize = (6, 6) + + + + plt.figure(figsize=figsize) + + plt.title("Normalized Knee Point") + + plt.plot(self.x_normalized, self.y_normalized, "b", label="normalized curve") + + plt.plot(self.x_difference, self.y_difference, "r", label="difference curve") + + plt.xticks( + + np.arange(self.x_normalized.min(), self.x_normalized.max() + 0.1, 0.1) + + ) + + plt.yticks( + + np.arange(self.y_difference.min(), self.y_normalized.max() + 0.1, 0.1) + + ) + + + + plt.vlines( + + self.norm_knee, + + plt.ylim()[0], + + plt.ylim()[1], + + linestyles="--", + + label="knee/elbow", + + ) + + plt.legend(loc="best") + + + + def plot_knee(self, figsize: Optional[Tuple[int, int]] = None): + + """ + + Plot the curve and the knee, if it exists + + + + :param figsize: Optional[Tuple[int, int] + + The figure size of the plot. 
Example (12, 8) + + :return: NoReturn + + """ + + import matplotlib.pyplot as plt + + + + if figsize is None: + + figsize = (6, 6) + + + + plt.figure(figsize=figsize) + + plt.title("Knee Point") + + plt.plot(self.x, self.y, "b", label="data") + + plt.vlines( + + self.knee, plt.ylim()[0], plt.ylim()[1], linestyles="--", label="knee/elbow" + + ) + + plt.legend(loc="best") + + + + # Niceties for users working with elbows rather than knees + + @property + + def elbow(self): + + return self.knee + + + + @property + + def norm_elbow(self): + + return self.norm_knee + + + + @property + + def elbow_y(self): + + return self.knee_y + + + + @property + + def norm_elbow_y(self): + + return self.norm_knee_y + + + + @property + + def all_elbows(self): + + return self.all_knees + + + + @property + + def all_norm_elbows(self): + + return self.all_norm_knees + + + + @property + + def all_elbows_y(self): + + return self.all_knees_y + + + + @property + + def all_norm_elbows_y(self): + + return self.all_norm_knees_y + + + + + +## xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + + + +#df=pd.read_excel("C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/run_results/3runs/testdata.xlsx") + +#df=pd.read_excel("data.xlsx", sheet_name='har2', header=None, na_values=['NA'], usecols="Aq,at",skiprows=range(97),nrows=6) + + + +nameoffile = my_test(sys.argv[1], sys.argv[2], sys.argv[3]) # this is for the java calling + +#nameoffile = my_test("sam","25", "C:/Users/sayan/Documents/testdata/data.xlsx") # this is for the python test + + + +#nameoffile_1 = "C:/Users/sayan/Documents/testdata/data.xlsx" + +df=pd.read_excel(nameoffile, sheet_name='Sheet1', header=None, na_values=['NA']) + +print(df) + +conv_arr= df.values + + + +#split matrix into 3 columns each into 1d array + +#print(conv_arr.shape) + +#print(conv_arr[1,1]) + +arr1 = np.delete(conv_arr,1,axis=1) + +arr2 = np.delete(conv_arr,0,axis=1) + + + +#converting into 1D array + +x = arr1.ravel() + +y = 
arr2.ravel() + + + +kn = KneeLocator(list(x), y , S=0.0, curve='convex', direction='decreasing',online=False ) #,interp_method='polynomial') + +#kn.set_filename_from_java("C:/Users/sayan/Documents/testdata/data.xlsx") + + + + + +kn2 = KneeLocator(list(x), y , S=1.0, curve='convex', direction='decreasing',online=False ) + +print(kn.knee) + +print(kn2.knee) + +#print(kn.norm_knee) + + + +# plt.style.use('ggplot') + +# plt.plot() + +# plt.xlabel('K (no. of clusters) ') + +# plt.ylabel('WCSSE') + +# #plt.title('Elbow method for optimal k.[data=HAR, k=4, Pred. k= %d]' %(kn.knee)) + +# plt.suptitle('Elbow Method For Optimal Cluster Determination [data=HAR_4clus, K=4, Pred.K = %d]' %(kn.knee),x=0.5, y=0.000, ha="center" , va="bottom") + +# plt.plot(x, y, 'bx-') + +# #plt.xscale('log') + +# plt.grid(True) + +# plt.xticks() + +# plt.vlines(kn.knee, plt.ylim()[0], plt.ylim()[1], linestyles='dashed') + +# plt.savefig("C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/run_results/3runs/graphs/test1.pdf") + +# plt.show() + +# + +# plt.style.use('ggplot') + +# plt.plot() + +# plt.xlabel('Buckets') + +# plt.ylabel('Counts') + +# plt.title('Elbow method for optimal k. [data=NOISE_30_1, k=10, Pred. k= %d]' %(kn2.knee)) + +# plt.plot(x, y, 'bx-') + +# #plt.xscale('log') + +# plt.grid(True) + +# plt.xticks() + +# plt.vlines(kn2.knee, plt.ylim()[0], plt.ylim()[1], linestyles='dashed') + +# plt.savefig("C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/run_results/3runs/graphs/test2.pdf") + +# plt.show() From 386fb2eac7bd2d1fcc25fcb8fa017eca4aac1240 Mon Sep 17 00:00:00 2001 From: Sayantan Date: Tue, 21 Dec 2021 13:54:29 -0500 Subject: [PATCH 21/29] updating whatever is on the local machine. 
--- src/main/java/edu/uc/rphash/kneefinder/JythonTest.java | 1 + src/main/java/edu/uc/rphash/kneefinder/KneeLocator.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/main/java/edu/uc/rphash/kneefinder/JythonTest.java b/src/main/java/edu/uc/rphash/kneefinder/JythonTest.java index 4ed89c2..1cd0689 100644 --- a/src/main/java/edu/uc/rphash/kneefinder/JythonTest.java +++ b/src/main/java/edu/uc/rphash/kneefinder/JythonTest.java @@ -167,6 +167,7 @@ public ArrayList run(double[][] data, double s, int smoothingWindow, b } return localMinMaxPts; } + // to test the funtion : public static void main(String[] args){ diff --git a/src/main/java/edu/uc/rphash/kneefinder/KneeLocator.py b/src/main/java/edu/uc/rphash/kneefinder/KneeLocator.py index 4d017aa..3ba35f0 100644 --- a/src/main/java/edu/uc/rphash/kneefinder/KneeLocator.py +++ b/src/main/java/edu/uc/rphash/kneefinder/KneeLocator.py @@ -699,6 +699,7 @@ def all_norm_elbows_y(self): conv_arr= df.values + #split matrix into 3 columns each into 1d array From 9d88578655cc7d55ca563dd7f2245041f59143cf Mon Sep 17 00:00:00 2001 From: Sayantan Date: Mon, 11 Apr 2022 02:00:57 -0400 Subject: [PATCH 22/29] implemented aging using micro clusters implemented aging using centroids implemented tracking of centroids implemented decay functions --- src/main/java/edu/uc/rphash/aging/Decay.java | 54 ++++++ .../edu/uc/rphash/aging/DecayPositional.java | 65 +++++++ .../edu/uc/rphash/aging/ageCentriods.java | 29 +++- .../edu/uc/rphash/aging/ageMicrocluster.java | 54 ++++++ .../centroidTracker/trackCentroids.java | 158 ++++++++++++++++-- 5 files changed, 348 insertions(+), 12 deletions(-) create mode 100644 src/main/java/edu/uc/rphash/aging/Decay.java create mode 100644 src/main/java/edu/uc/rphash/aging/DecayPositional.java create mode 100644 src/main/java/edu/uc/rphash/aging/ageMicrocluster.java diff --git a/src/main/java/edu/uc/rphash/aging/Decay.java b/src/main/java/edu/uc/rphash/aging/Decay.java new file mode 100644 index 0000000..6edd861 
--- /dev/null +++ b/src/main/java/edu/uc/rphash/aging/Decay.java @@ -0,0 +1,54 @@ +package edu.uc.rphash.aging; + +public class Decay implements Runnable { + + + // public double value; + public double t; + public double decayRate; + + + @Override + public void run() { + + } + +public static double ExpDecayFormula ( Number halfLifeInSeconds , float t ) { + + Double decayRate = - Math.log(2) / halfLifeInSeconds.longValue() / 1000; + + Double expMultiplier = Math.pow(Math.E, decayRate * t); + return expMultiplier; + + } + +public static double LinearDecayFormula ( Number lifeTimeInSeconds , float t ) { + + + Double lifeTime = Double.valueOf(lifeTimeInSeconds.longValue()) * 1000; + + if (t < 0 || t > lifeTime ) { + Double linearMultiplier = -0.1; // explain + return linearMultiplier; + } + else { + Double linearMultiplier =(1 - t / lifeTime); + return linearMultiplier; + } + +} + +public static double LogDecayFormula (long lifeTimeInSeconds , float t) { + + + Double lifeTime = Double.valueOf(lifeTimeInSeconds) * 1000; + + if (t < 0 || t >= lifeTime ) { + return 0.0; + } else { + // return value + 1 - Math.pow(Math.E, Math.log(value + 1)/lifeTime*t); + return lifeTime; + } + } + +} \ No newline at end of file diff --git a/src/main/java/edu/uc/rphash/aging/DecayPositional.java b/src/main/java/edu/uc/rphash/aging/DecayPositional.java new file mode 100644 index 0000000..197ba08 --- /dev/null +++ b/src/main/java/edu/uc/rphash/aging/DecayPositional.java @@ -0,0 +1,65 @@ +package edu.uc.rphash.aging; + +public class DecayPositional implements Runnable { + + // public double value; + public double t; + public double decayRate; + + @Override + public void run() { + + } + +public static double ExpDecayFormula ( Number halfLifeInSeconds , float t ) { + + Double decayRate = - Math.log(2) / halfLifeInSeconds.longValue() / 1000; + Double expMultiplier = Math.pow(Math.E, decayRate * t); + return expMultiplier; + + } + +public static double ExpDecayFormula2 ( double decayRate , float 
t ) { + decayRate = -1*decayRate; + Double expMultiplier = Math.pow(Math.E, decayRate * t); + return expMultiplier; + +} +public static double LinearDecayFormula ( Number lifeTimeInSeconds , float t ) { + + Double lifeTime = Double.valueOf(lifeTimeInSeconds.longValue()) * 1000; + + if (t < 0 || t > lifeTime ) { + Double linearMultiplier = -0.1; // explain + return linearMultiplier; + } + else { + Double linearMultiplier =(1 - t / lifeTime); + return linearMultiplier; + } +} + +public static double LogDecayFormula (long lifeTimeInSeconds , float t) { + + Double lifeTime = Double.valueOf(lifeTimeInSeconds) * 1000; + + if (t < 0 || t >= lifeTime ) { + return 0.0; + } else { + // return value + 1 - Math.pow(Math.E, Math.log(value + 1)/lifeTime*t); + return lifeTime; + } + } + +public static void main(String[] args) +{ + //Number halfLifeInSeconds = 0.1; + double decayRate = 0.5; + float t = 2 ; + +// double expmul = ExpDecayFormula ( halfLifeInSeconds , t ); + double expmul2 = ExpDecayFormula2( decayRate , t ); + System.out.print(expmul2); +} + +} diff --git a/src/main/java/edu/uc/rphash/aging/ageCentriods.java b/src/main/java/edu/uc/rphash/aging/ageCentriods.java index 775189e..665f5f8 100644 --- a/src/main/java/edu/uc/rphash/aging/ageCentriods.java +++ b/src/main/java/edu/uc/rphash/aging/ageCentriods.java @@ -1,10 +1,16 @@ package edu.uc.rphash.aging; +import java.util.List; + import edu.uc.rphash.Centroid; import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.aging.DecayPositional; public class ageCentriods implements Runnable { + + static double decayRate = 0.5 ; + static DecayPositional decay = new DecayPositional(); @Override public void run() { @@ -13,4 +19,25 @@ public void run() { } -} + public static List> ageListOfcent( List> prev ) { + + + for (int i = 0; i < prev.size(); i++) + { + + double ageMultiplier= decay.ExpDecayFormula2 ( decayRate , i ); + List tempCents = prev.get(i); + + for (int j =0 ; j > ageListOfMicroClusters( List< HashMap > 
Maps_OfIDAndCount ) { + + // HashMap MapOfIDAndCount1 = new HashMap<>(); + for (int i = 0; i < Maps_OfIDAndCount.size(); i++) + { + + double ageMultiplier= decay.ExpDecayFormula2 ( decayRate , i ); + HashMap MapOfIDAndCount = Maps_OfIDAndCount.get(i); + + for (Long cur_id : new TreeSet(MapOfIDAndCount.keySet())) { + + int cur_count = (int) (MapOfIDAndCount.get(cur_id).longValue()); + + cur_count = (int) (cur_count * ageMultiplier); + } + } + + return Maps_OfIDAndCount; + } +} diff --git a/src/main/java/edu/uc/rphash/centroidTracker/trackCentroids.java b/src/main/java/edu/uc/rphash/centroidTracker/trackCentroids.java index 70a31bd..89c3fda 100644 --- a/src/main/java/edu/uc/rphash/centroidTracker/trackCentroids.java +++ b/src/main/java/edu/uc/rphash/centroidTracker/trackCentroids.java @@ -1,34 +1,170 @@ package edu.uc.rphash.centroidTracker; +import java.util.List; + import edu.uc.rphash.Centroid; import edu.uc.rphash.Readers.RPHashObject; import edu.uc.rphash.frequentItemSet.KHHCentroidCounter; import edu.uc.rphash.lsh.LSH; -public class trackCentroids implements Runnable { +/* +1. Check the number of previous centroids and current centroids. +2. Case i. If the previous centroids = current centroids +Compute a distance matrix ( Euclidean, Cosine ) between the two sets of centroids. +Assign each one to its closest one. + Case ii. If the previous centroids > current centroids + Compute the distance matrix between two sets. + Case a. find closest one and assign movements. Find 2nd closest ones to them and assign them merged. + + Case iii. If Previous centroids < current centroids +Compute the distance matrix between two sets. + Case a. find closest one and assign movements and declare the remaining as new. 
+*/ - private float[] vec; +public class trackCentroids implements Runnable { +// private float[] vec; +// private float[][] dismtx; public trackCentroids(float[] vec, LSH[] lshfuncs) { } - static float[] scale(float[] t, float s) { - float[] ret = new float[t.length]; - for (int i = 0; i < t.length; i++) { - ret[i] = s*t[i]; - } - - return ret; - } - + // This function returns the square of the euclidean distance. + public static float distancesq(float[] x, float[] y) { + if (x.length < 1) + return Float.MAX_VALUE; + if (y.length < 1) + return Float.MAX_VALUE; + float dist = (x[0] - y[0]) * (x[0] - y[0]); + for (int i = 1; i < x.length; i++) + dist += ((x[i] - y[i]) * (x[i] - y[i])); +// return (float) Math.sqrt(dist); + return dist; + } + + // This function returns the cosine dot distance. + public static float dot(float[] t, float[] u) { + float s = 0; + for (int i = 0; i < t.length; i++) { + s += t[i] * u[i]; + } + return s; + } + + /* + computes the distance matrix between the set of centroids. 
+ + + */ + + + + public static float[][] createDistanceMatrix( List prev , List curr) { + + float[][] dismtx = new float[prev.size()][curr.size()+3] ; + int currcent=-1; + int prevcent =-1; + + for (int i = 0; i < prev.size(); i++) + { + + float mindis= distancesq(prev.get(i).centroid() , curr.get(0).centroid()); + for (int j = 0; j < curr.size(); j++) { + + dismtx[i][j]= distancesq(prev.get(i).centroid() , curr.get(j).centroid()); + if (dismtx[i][j]<= mindis) { + mindis = dismtx[i][j]; + prevcent=i; + currcent=j; + } + } + dismtx[i][curr.size()+3] = mindis; + dismtx[i][curr.size()+2] = currcent; + dismtx[i][curr.size()+1] = prevcent; + + + } + + return dismtx; + } + + + + public static float[][] createCosineDistanceMatrix( List prev , List curr) { + + float[][] dismtx = new float[prev.size()][curr.size()+3] ; + int currcent=-1; + int prevcent =-1; + + for (int i = 0; i < prev.size(); i++) + { + + float mindis= dot(prev.get(i).centroid() , curr.get(0).centroid()); + for (int j = 0; j < curr.size(); j++) { + + dismtx[i][j]= dot(prev.get(i).centroid() , curr.get(j).centroid()); + if (dismtx[i][j]<= mindis) { + mindis = dismtx[i][j]; + prevcent=i; + currcent=j; + } + } + dismtx[i][curr.size()+3] = mindis; + dismtx[i][curr.size()+2] = currcent; + dismtx[i][curr.size()+1] = prevcent; + + + } + + return dismtx; + } + + + @Override public void run() { // TODO Auto-generated method stub } + + + public static float[][] mappingcents( List prev , List curr) { + + float[][] mapping1 = new float[prev.size()][curr.size()]; + float[][] mapping2 = new float[prev.size()][curr.size()]; + + float[][] dismtx_euclid=createDistanceMatrix(prev, curr); + float[][] dismtx_cosine=createCosineDistanceMatrix(prev, curr); + + if (prev.size()==curr.size()) + { + + mapping1=dismtx_euclid; + mapping2=dismtx_cosine; + + }; + + if (prev.size()curr.size()) // centroids may have merged and formed + { + + mapping1=dismtx_euclid; + mapping2=dismtx_cosine; + + }; + return mapping1; + + } + } From 
5e969ad7a2865643cf410050d79e7d8100479a6a Mon Sep 17 00:00:00 2001 From: Sayantan Date: Thu, 14 Apr 2022 13:47:22 -0400 Subject: [PATCH 23/29] updated data plotting from java using XChart --- .classpath | 1 + .settings/org.eclipse.core.resources.prefs | 2 + .../java/edu/uc/rphash/aging/ageVectors.java | 15 ---- .../rphash/tests/generators/GenerateData.java | 1 + .../java/edu/uc/rphash/tests/plotting.java | 86 +++++++++++++++++++ 5 files changed, 90 insertions(+), 15 deletions(-) create mode 100644 .settings/org.eclipse.core.resources.prefs delete mode 100644 src/main/java/edu/uc/rphash/aging/ageVectors.java create mode 100644 src/main/java/edu/uc/rphash/tests/plotting.java diff --git a/.classpath b/.classpath index ba55ddb..98c1e44 100644 --- a/.classpath +++ b/.classpath @@ -10,5 +10,6 @@ + diff --git a/.settings/org.eclipse.core.resources.prefs b/.settings/org.eclipse.core.resources.prefs new file mode 100644 index 0000000..d1fb81f --- /dev/null +++ b/.settings/org.eclipse.core.resources.prefs @@ -0,0 +1,2 @@ +eclipse.preferences.version=1 +encoding//src/main/java/edu/uc/rphash/tests/plotting.java=UTF-8 diff --git a/src/main/java/edu/uc/rphash/aging/ageVectors.java b/src/main/java/edu/uc/rphash/aging/ageVectors.java deleted file mode 100644 index 485859e..0000000 --- a/src/main/java/edu/uc/rphash/aging/ageVectors.java +++ /dev/null @@ -1,15 +0,0 @@ -package edu.uc.rphash.aging; - -import edu.uc.rphash.Centroid; -import edu.uc.rphash.Readers.RPHashObject; - - -public class ageVectors implements Runnable { - - - @Override - public void run() { - - } - -} diff --git a/src/main/java/edu/uc/rphash/tests/generators/GenerateData.java b/src/main/java/edu/uc/rphash/tests/generators/GenerateData.java index 3d643f1..d0f374e 100644 --- a/src/main/java/edu/uc/rphash/tests/generators/GenerateData.java +++ b/src/main/java/edu/uc/rphash/tests/generators/GenerateData.java @@ -517,6 +517,7 @@ public List getData() { return data; } + @Override public List getLabels() { // TODO 
Auto-generated method stub diff --git a/src/main/java/edu/uc/rphash/tests/plotting.java b/src/main/java/edu/uc/rphash/tests/plotting.java new file mode 100644 index 0000000..2bebfac --- /dev/null +++ b/src/main/java/edu/uc/rphash/tests/plotting.java @@ -0,0 +1,86 @@ +package edu.uc.rphash.tests; + +import java.util.LinkedList; +import java.util.List; +import java.util.Random; + +//import org.knowm.xchart.*; +import org.knowm.xchart.XYSeries.XYSeriesRenderStyle; +import org.knowm.xchart.style.Styler.LegendPosition; +import org.knowm.xchart.style.markers.SeriesMarkers; +import org.knowm.xchart.internal.Utils; +import org.knowm.xchart.QuickChart; +import org.knowm.xchart.SwingWrapper; +import org.knowm.xchart.XYChart; +import org.knowm.xchart.XYChartBuilder; +import org.knowm.xchart.XYSeries; + +import edu.uc.rphash.tests.generators.GenerateData; + +public class plotting { + + static Random random = new Random(); + + private static List getGaussian(int number, double mean, double std) { + + List seriesData = new LinkedList(); + for (int i = 0; i < number; i++) { + seriesData.add(mean + std * random.nextGaussian()); + } + + return seriesData; + + } + +public static void main(String[] args) { +double[] xData = new double[] { 0.0, 1.0, 2.0 }; +double[] yData = new double[] { 2.0, 1.0, 0.0 }; + +// Create Chart +XYChart chart = QuickChart.getChart("Sample Chart", "X", "Y", "y(x)", xData, yData); + +// Show it +new SwingWrapper(chart).displayChart(); + + +//Create Chart2 +XYChart chart2 = new XYChartBuilder().width(600).height(500).title("Gaussian Blobs").xAxisTitle("X").yAxisTitle("Y").build(); + +//Customize Chart2 +chart2.getStyler().setDefaultSeriesRenderStyle(XYSeriesRenderStyle.Scatter); +//chart2.getStyler().setChartTitleVisible(false); +//chart2.getStyler().setLegendPosition(LegendPosition.InsideSW); +chart2.getStyler().setMarkerSize(16); + +//Series + +int k = 10;//6; +int d = 2;//16; +int n = 10000; +float var = 1.5f; +GenerateData gen = new GenerateData(k, n/k, 
d, var, true, .5f); + +chart2.addSeries("Gaussian Blob 1", getGaussian(1000, 5, 1), getGaussian(1000, 5, 1)); + +XYSeries series2 = chart2.addSeries("Gaussian Blob 2", getGaussian(1000, 50, 1), getGaussian(1000, 50, 1)); + +XYSeries series3 = chart2.addSeries("Gaussian Blob 3", getGaussian(1000, 5, 1), getGaussian(1000, 50, 1)); + +XYSeries series4 = chart2.addSeries("Gaussian Blob 4", getGaussian(1000, 50, 1), getGaussian(1000, 5, 1)); + +XYSeries series5 = chart2.addSeries("Gaussian Blob 5", getGaussian(1000, 25, 1), getGaussian(1000, 25, 1)); + +//chart2.addSeries("Gaussian Blob 2", getDoubleArrayFromNumberList​(gen.getData()), getDoubleArrayFromNumberList​(gen.getData())); + +//XYSeries series = chart2.addSeries("Gaussian Blob 2", getDoubleArrayFromNumberList​(gen.getData()), getDoubleArrayFromNumberList​(gen.getData())); + +//series2.setMarker(SeriesMarkers.DIAMOND); + + +new SwingWrapper(chart2).displayChart(); + + } + + + +} \ No newline at end of file From 87e9b28f42d66a8e117909e09e5682eb380c27b4 Mon Sep 17 00:00:00 2001 From: Sayantan Date: Fri, 15 Apr 2022 22:21:29 -0400 Subject: [PATCH 24/29] creating the main classes for 1. Parameter-free Projected Adaptive Hash Stream clustering 2. Distributed-Parameter-free Projected Adaptive Hash Stream clustering 3. Parameter-free Random Projection Hash Stream clustering 4. 
Distributed Parameter-free Random Projection Hash Stream clustering --- .../java/edu/uc/rphash/Dis_PPAHStream.java | 371 ++++++++++++++++++ .../java/edu/uc/rphash/Dis_PRPHashStream.java | 371 ++++++++++++++++++ src/main/java/edu/uc/rphash/PPAHStream.java | 324 +++++++++++++++ .../java/edu/uc/rphash/PRPHashStream.java | 277 +++++++++++++ 4 files changed, 1343 insertions(+) create mode 100644 src/main/java/edu/uc/rphash/Dis_PPAHStream.java create mode 100644 src/main/java/edu/uc/rphash/Dis_PRPHashStream.java create mode 100644 src/main/java/edu/uc/rphash/PPAHStream.java create mode 100644 src/main/java/edu/uc/rphash/PRPHashStream.java diff --git a/src/main/java/edu/uc/rphash/Dis_PPAHStream.java b/src/main/java/edu/uc/rphash/Dis_PPAHStream.java new file mode 100644 index 0000000..7630571 --- /dev/null +++ b/src/main/java/edu/uc/rphash/Dis_PPAHStream.java @@ -0,0 +1,371 @@ +package edu.uc.rphash; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.concurrent.Callable; +import java.util.concurrent.ConcurrentSkipListSet; +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.ForkJoinPool; +import java.util.concurrent.Future; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.decoders.Decoder; +import edu.uc.rphash.decoders.Spherical; +import edu.uc.rphash.frequentItemSet.ItemSet; +import edu.uc.rphash.frequentItemSet.SimpleFrequentItemSet; +import edu.uc.rphash.lsh.LSH; +//import edu.uc.rphash.projections.DBFriendlyProjection; +import edu.uc.rphash.projections.Projector; +import 
edu.uc.rphash.standardhash.HashAlgorithm; +import edu.uc.rphash.standardhash.NoHash; +import edu.uc.rphash.tests.clusterers.KMeans2; +import edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.VectorUtil; + +public class Dis_PPAHStream implements Clusterer { + // float variance; + + public ItemSet is; + + List labels; + HashMap labelmap; + + private int processors = 1; + + public static long mapfunc(float[] vec, LSH lshfunc) { + + return lshfunc.lshHash(vec); + + } + + public RPHashObject mapreduce1() { + + //------------This is Setup Code------------- + // create our LSH Machine + HashAlgorithm hal = new NoHash(so.getHashmod()); + Iterator vecs = so.getVectorIterator(); + if (!vecs.hasNext()) + return so; + + int logk = (int) (.5 + Math.log(so.getk()) / Math.log(2));// log k and + // round to + // integer + int k = so.getk() * logk; + is = new SimpleFrequentItemSet(k); + Decoder dec = so.getDecoderType(); + dec.setCounter(is); + + Projector p = so.getProjectionType(); + p.setOrigDim(so.getdim()); + p.setProjectedDim(dec.getDimensionality()); + p.setRandomSeed(so.getRandomSeed()); + p.init(); + // no noise to start with + List noise = LSH.genNoiseTable( + dec.getDimensionality(), + so.getNumBlur(), + new Random(), + dec.getErrorRadius() + / (dec.getDimensionality() * dec.getDimensionality())); + + LSH lshfunc = new LSH(dec, p, hal, noise, so.getNormalize()); + + // add to frequent itemset the hashed Decoded randomly projected vector + + + List dat = so.getRawData(); + + //Dey + //------------------------- + //------------This is the actual map function------------- + + //this is the actual map + ForkJoinPool forkJoinPool = new ForkJoinPool(this.processors ); + try { + forkJoinPool.submit(() -> + dat.parallelStream().map(s->mapfunc(s,lshfunc)).forEach(s->is.add(s)) + ).get(); + } catch (ExecutionException|InterruptedException e) { + e.printStackTrace(); + } + forkJoinPool.shutdown(); + + //------------------------- + + + //------------This is 
clean up code------------- + List topids = is.getTop(); + so.setPreviousTopID(topids); + + List topsizes = is.getCounts(); + + + // this is where the parallel reduce function would be + // to sum up the counts that correspond to hash_ids + // so very much the word count example + List countsAsFloats = new ArrayList(); + for (long ct : topsizes) + countsAsFloats.add((float) ct); + so.setCounts(countsAsFloats); + return so; + } + + public static void redFunc(float[] vec, LSH lshfunc, List noise, + List labels, List centroids) { + long[] hash = lshfunc.lshHashRadius(vec, noise); + labels.add(-1l); + // radius probe around the vector + for (Centroid cent : centroids) { + for (long h : hash) { + if (cent.ids.contains(h)) { + cent.updateVec(vec); + labels.set(labels.size() - 1, cent.id); + } + } + } + } + + public static long[] redFunc(float[] vec, LSH lshfunc, List noise) { + return lshfunc.lshHashRadius(vec, noise); + } + + /* + * This is the second phase after the top ids have been in the reduce phase + * aggregated + */ + public RPHashObject mapreduce2() { + + //------------This is Setup Code------------- + Iterator vecs = so.getVectorIterator(); + if (!vecs.hasNext()) + return so; + float[] vec = vecs.next(); + + HashAlgorithm hal = new NoHash(so.getHashmod()); + Decoder dec = so.getDecoderType(); + + Projector p = so.getProjectionType(); + p.setOrigDim(so.getdim()); + p.setProjectedDim(dec.getDimensionality()); + p.setRandomSeed(so.getRandomSeed()); + p.init(); + + List noise = LSH.genNoiseTable( + so.getdim(), + so.getNumBlur(), + new Random(so.getRandomSeed()), + (float) (dec.getErrorRadius()) + / (float) (dec.getDimensionality() * dec + .getDimensionality())); + + LSH lshfunc = new LSH(dec, p, hal, noise, so.getNormalize()); + ArrayList centroids = new ArrayList(); + + for (long id : so.getPreviousTopID()) { + centroids.add(new Centroid(so.getdim(), id, -1)); + } + + //DEY + //------------------------------------------------- + //------------This is the parallel 
map------------- + + List dat = so.getRawData(); + + + ForkJoinPool forkJoinPool = new ForkJoinPool(this.processors ); + try { + //parallel map + forkJoinPool.submit(() -> + dat.parallelStream().map(s->redFunc(s,lshfunc,noise)).forEach(hashes -> { + //end parallel map + + //parallel reduce + //local centroids is what would need to be implemented + // to update in parallel in each node + // currently this thing shares the centroids list, which is a bottleneck + // the reducer would need to use this to reduce centroids with the same id + // Centroid.merge(ctcent1, cent1,wcsscent1,ctcent2, cent2,wcsscent2); +// List localcentroids = centroids.stream().map(Centroid::new).collect(Centroid.toArrayList()); + for (Centroid cent : centroids) { + for (long h : hashes) + { + if (cent.ids.contains(h)) + { + cent.updateVec(vec); + } + } + } + })).get(); + } catch (InterruptedException|ExecutionException e) { + e.printStackTrace(); + } + + forkJoinPool.shutdown(); + //------------------------------------------------- + + //------------This is the cleanup code------------- + //Sequential + + Clusterer offlineclusterer = new KMeans2();//so.getOfflineClusterer(); + offlineclusterer.setData(centroids.stream().collect(Centroid.toArrayList())); + offlineclusterer.setWeights(so.getCounts()); + offlineclusterer.setK(so.getk()); + +// this.labelmap = VectorUtil.generateIDMap(centroids, this.centroids); + so.setCentroids(offlineclusterer.getCentroids()); + return so; + } + + // 271458 + // 264779.7 + + public List getLabels() { + for (int i = 0; i < labels.size(); i++) { + if (labelmap.containsKey(labels.get(i))) { + labels.set(i, labelmap.get(labels.get(i))); + } else { + labels.set(i, -1l); + } + } + return this.labels; + } + + private List centroids = null; + private RPHashObject so; + + public Dis_PPAHStream(List data, int k) { + so = new SimpleArrayReader(data, k); + } + +// int threads = 1; + + public Dis_PPAHStream(List data, int k, int processors) { + + this.processors = 
processors; + so = new SimpleArrayReader(data, k); + so.setParallel(true); + } + + public Dis_PPAHStream(List data, int k, int times, int rseed) { + so = new SimpleArrayReader(data, k); + } + + public Dis_PPAHStream(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + if (centroids == null) + run(); + return centroids; + } + + @Override + public List getCentroids() { + if (centroids == null) + run(); + + return centroids; + } + + private void run() { + mapreduce1(); + mapreduce2(); + //this.centroids = so.getCentroids(); + } + + public static void main(String[] args) { + int k = 10; + int d = 1000; + int n = 10000; + float var = 1f; + int count = 5; + System.out.printf("Decoder: %s\n", "Sphere"); + System.out.printf("ClusterVar\t"); + for (int i = 0; i < count; i++) + System.out.printf("Trial%d\t", i); + System.out.printf("RealWCSS\n"); + + for (float f = var; f < 3.01; f += .05f) { + float avgrealwcss = 0; + float avgtime = 0; + System.out.printf("%f\t", f); + for (int i = 0; i < count; i++) { + GenerateData gen = new GenerateData(k, n / k, d, f, true, 1f); + RPHashObject o = new SimpleArrayReader(gen.data, k); + Dis_PPAHStream rphit = new Dis_PPAHStream(o); +// rphit.threads = 4; + o.setDecoderType(new Spherical(32, 4, 1)); + // o.setDimparameter(31); + o.setOfflineClusterer(new KMeans2()); + long startTime = System.nanoTime(); + List centsr = rphit.getCentroids(); + avgtime += (System.nanoTime() - startTime) / 100000000; + + // avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(), + // gen.getData()); + + // System.out.printf("%.0f\t", + // StatTests.WCSSECentroidsFloat(centsr, gen.data)); + // System.gc(); + + } + System.out.printf("%.0f\n", avgrealwcss / count); + } + } + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = 
centroids; + + } + + @Override + public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + // TODO Auto-generated method stub + + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + return true; + } +} diff --git a/src/main/java/edu/uc/rphash/Dis_PRPHashStream.java b/src/main/java/edu/uc/rphash/Dis_PRPHashStream.java new file mode 100644 index 0000000..75614a0 --- /dev/null +++ b/src/main/java/edu/uc/rphash/Dis_PRPHashStream.java @@ -0,0 +1,371 @@ +package edu.uc.rphash; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.concurrent.Callable; +import java.util.concurrent.ConcurrentSkipListSet; +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.ForkJoinPool; +import java.util.concurrent.Future; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.decoders.Decoder; +import edu.uc.rphash.decoders.Spherical; +import edu.uc.rphash.frequentItemSet.ItemSet; +import edu.uc.rphash.frequentItemSet.SimpleFrequentItemSet; +import edu.uc.rphash.lsh.LSH; +//import edu.uc.rphash.projections.DBFriendlyProjection; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.standardhash.HashAlgorithm; +import edu.uc.rphash.standardhash.NoHash; +import edu.uc.rphash.tests.clusterers.KMeans2; +import 
edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.VectorUtil; + +public class Dis_PRPHashStream implements Clusterer { + // float variance; + + public ItemSet is; + + List labels; + HashMap labelmap; + + private int processors = 1; + + public static long mapfunc(float[] vec, LSH lshfunc) { + + return lshfunc.lshHash(vec); + + } + + public RPHashObject mapreduce1() { + + //------------This is Setup Code------------- + // create our LSH Machine + HashAlgorithm hal = new NoHash(so.getHashmod()); + Iterator vecs = so.getVectorIterator(); + if (!vecs.hasNext()) + return so; + + int logk = (int) (.5 + Math.log(so.getk()) / Math.log(2));// log k and + // round to + // integer + int k = so.getk() * logk; + is = new SimpleFrequentItemSet(k); + Decoder dec = so.getDecoderType(); + dec.setCounter(is); + + Projector p = so.getProjectionType(); + p.setOrigDim(so.getdim()); + p.setProjectedDim(dec.getDimensionality()); + p.setRandomSeed(so.getRandomSeed()); + p.init(); + // no noise to start with + List noise = LSH.genNoiseTable( + dec.getDimensionality(), + so.getNumBlur(), + new Random(), + dec.getErrorRadius() + / (dec.getDimensionality() * dec.getDimensionality())); + + LSH lshfunc = new LSH(dec, p, hal, noise, so.getNormalize()); + + // add to frequent itemset the hashed Decoded randomly projected vector + + + List dat = so.getRawData(); + + //Dey + //------------------------- + //------------This is the actual map function------------- + + //this is the actual map + ForkJoinPool forkJoinPool = new ForkJoinPool(this.processors ); + try { + forkJoinPool.submit(() -> + dat.parallelStream().map(s->mapfunc(s,lshfunc)).forEach(s->is.add(s)) + ).get(); + } catch (ExecutionException|InterruptedException e) { + e.printStackTrace(); + } + forkJoinPool.shutdown(); + + //------------------------- + + + //------------This is clean up code------------- + List topids = is.getTop(); + so.setPreviousTopID(topids); + + List topsizes = is.getCounts(); + + + // this is 
where the parallel reduce function would be + // to sum up the counts that correspond to hash_ids + // so very much the word count example + List countsAsFloats = new ArrayList(); + for (long ct : topsizes) + countsAsFloats.add((float) ct); + so.setCounts(countsAsFloats); + return so; + } + + public static void redFunc(float[] vec, LSH lshfunc, List noise, + List labels, List centroids) { + long[] hash = lshfunc.lshHashRadius(vec, noise); + labels.add(-1l); + // radius probe around the vector + for (Centroid cent : centroids) { + for (long h : hash) { + if (cent.ids.contains(h)) { + cent.updateVec(vec); + labels.set(labels.size() - 1, cent.id); + } + } + } + } + + public static long[] redFunc(float[] vec, LSH lshfunc, List noise) { + return lshfunc.lshHashRadius(vec, noise); + } + + /* + * This is the second phase after the top ids have been in the reduce phase + * aggregated + */ + public RPHashObject mapreduce2() { + + //------------This is Setup Code------------- + Iterator vecs = so.getVectorIterator(); + if (!vecs.hasNext()) + return so; + float[] vec = vecs.next(); + + HashAlgorithm hal = new NoHash(so.getHashmod()); + Decoder dec = so.getDecoderType(); + + Projector p = so.getProjectionType(); + p.setOrigDim(so.getdim()); + p.setProjectedDim(dec.getDimensionality()); + p.setRandomSeed(so.getRandomSeed()); + p.init(); + + List noise = LSH.genNoiseTable( + so.getdim(), + so.getNumBlur(), + new Random(so.getRandomSeed()), + (float) (dec.getErrorRadius()) + / (float) (dec.getDimensionality() * dec + .getDimensionality())); + + LSH lshfunc = new LSH(dec, p, hal, noise, so.getNormalize()); + ArrayList centroids = new ArrayList(); + + for (long id : so.getPreviousTopID()) { + centroids.add(new Centroid(so.getdim(), id, -1)); + } + + //DEY + //------------------------------------------------- + //------------This is the parallel map------------- + + List dat = so.getRawData(); + + + ForkJoinPool forkJoinPool = new ForkJoinPool(this.processors ); + try { + //parallel 
map + forkJoinPool.submit(() -> + dat.parallelStream().map(s->redFunc(s,lshfunc,noise)).forEach(hashes -> { + //end parallel map + + //parallel reduce + //local centroids is what would need to be implemented + // to update in parallel in each node + // currently this thing shares the centroids list, which is a bottleneck + // the reducer would need to use this to reduce centroids with the same id + // Centroid.merge(ctcent1, cent1,wcsscent1,ctcent2, cent2,wcsscent2); +// List localcentroids = centroids.stream().map(Centroid::new).collect(Centroid.toArrayList()); + for (Centroid cent : centroids) { + for (long h : hashes) + { + if (cent.ids.contains(h)) + { + cent.updateVec(vec); + } + } + } + })).get(); + } catch (InterruptedException|ExecutionException e) { + e.printStackTrace(); + } + + forkJoinPool.shutdown(); + //------------------------------------------------- + + //------------This is the cleanup code------------- + //Sequential + + Clusterer offlineclusterer = new KMeans2();//so.getOfflineClusterer(); + offlineclusterer.setData(centroids.stream().collect(Centroid.toArrayList())); + offlineclusterer.setWeights(so.getCounts()); + offlineclusterer.setK(so.getk()); + +// this.labelmap = VectorUtil.generateIDMap(centroids, this.centroids); + so.setCentroids(offlineclusterer.getCentroids()); + return so; + } + + // 271458 + // 264779.7 + + public List getLabels() { + for (int i = 0; i < labels.size(); i++) { + if (labelmap.containsKey(labels.get(i))) { + labels.set(i, labelmap.get(labels.get(i))); + } else { + labels.set(i, -1l); + } + } + return this.labels; + } + + private List centroids = null; + private RPHashObject so; + + public Dis_PRPHashStream(List data, int k) { + so = new SimpleArrayReader(data, k); + } + +// int threads = 1; + + public Dis_PRPHashStream(List data, int k, int processors) { + + this.processors = processors; + so = new SimpleArrayReader(data, k); + so.setParallel(true); + } + + public Dis_PRPHashStream(List data, int k, int times, int 
rseed) { + so = new SimpleArrayReader(data, k); + } + + public Dis_PRPHashStream(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + if (centroids == null) + run(); + return centroids; + } + + @Override + public List getCentroids() { + if (centroids == null) + run(); + + return centroids; + } + + private void run() { + mapreduce1(); + mapreduce2(); + //this.centroids = so.getCentroids(); + } + + public static void main(String[] args) { + int k = 10; + int d = 1000; + int n = 10000; + float var = 1f; + int count = 5; + System.out.printf("Decoder: %s\n", "Sphere"); + System.out.printf("ClusterVar\t"); + for (int i = 0; i < count; i++) + System.out.printf("Trial%d\t", i); + System.out.printf("RealWCSS\n"); + + for (float f = var; f < 3.01; f += .05f) { + float avgrealwcss = 0; + float avgtime = 0; + System.out.printf("%f\t", f); + for (int i = 0; i < count; i++) { + GenerateData gen = new GenerateData(k, n / k, d, f, true, 1f); + RPHashObject o = new SimpleArrayReader(gen.data, k); + Dis_PRPHashStream rphit = new Dis_PRPHashStream(o); +// rphit.threads = 4; + o.setDecoderType(new Spherical(32, 4, 1)); + // o.setDimparameter(31); + o.setOfflineClusterer(new KMeans2()); + long startTime = System.nanoTime(); + List centsr = rphit.getCentroids(); + avgtime += (System.nanoTime() - startTime) / 100000000; + + // avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(), + // gen.getData()); + + // System.out.printf("%.0f\t", + // StatTests.WCSSECentroidsFloat(centsr, gen.data)); + // System.gc(); + + } + System.out.printf("%.0f\n", avgrealwcss / count); + } + } + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new 
ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + // TODO Auto-generated method stub + + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + return true; + } +} diff --git a/src/main/java/edu/uc/rphash/PPAHStream.java b/src/main/java/edu/uc/rphash/PPAHStream.java new file mode 100644 index 0000000..59939cc --- /dev/null +++ b/src/main/java/edu/uc/rphash/PPAHStream.java @@ -0,0 +1,324 @@ +package edu.uc.rphash; + + + + +/* + This class will run the Parameter-free Projected Adaptive Hash Stream Clustering + */ +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Random; +import java.util.Map.Entry; +import java.util.TreeSet; +import java.util.stream.Stream; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.generators.GenerateStreamData; + + + + +public class PPAHStream implements StreamClusterer { + + + private float[] rngvec; + private List centroids = null; + private RPHashObject so; + // #create projector matrixs + Projector projector ; + int ct=0; + int pdim = 20; + + public PPAHStream(int k, GenerateStreamData gen, int i) { + so = new SimpleArrayReader(gen,k); + projector = so.getProjectionType(); + projector.setOrigDim(so.getdim()); + projector.setProjectedDim(pdim); + projector.setRandomSeed(so.getRandomSeed()); + projector.init(); + initTablesWith(); + } + + public List getCentroids(RPHashObject so) { + this.so = so; + return getCentroids(); + } + + + /* + * X - set of vectors compute the medoid of a vector set + */ + /** Add vector to running Centroid + * @param cnt_1,cnt_2 + * @param x_1 + */ + public static float[] update_cent(int ct, 
float[] x, float[] cent){ + for(int i=0;i 0) + s += 1; + addcent(s,x); + } + return s; + } + + + /* + * ===========================MinCount Sketch======================= + */ + public static final long PRIME_MODULUS = (1L << 31) - 1; + private int depth; + private int width; + private int[][] tableS; + private float[][][] tableCent; + private long[] hashA; + + + private void initTablesWith() { + this.width = (int) Math.ceil(2 / .025); + this.depth = (int) Math.ceil(-Math.log(1 - .97) / Math.log(2)); + this.tableS = new int[depth][width]; + this.tableCent = new float[depth][width][];//we will fill these in as we need them + this.hashA = new long[depth];//hash offsets + Random r = new Random(); + for (int i = 0; i < depth; ++i) { + hashA[i] = r.nextLong(); + } + } + + private int hash(long item, int i) { + long hash = hashA[i] * item; + hash += hash >>> 32; + hash &= PRIME_MODULUS; + return (int) (hash % width); + + } + + private int count(long lshhash) { + int min = (int) tableS[0][hash(lshhash, 0)]; + for (int i = 1; i < depth; ++i) { + if (tableS[i][hash(lshhash, i)] < min) + min = (int) tableS[i][hash(lshhash, i)]; + } + return min; + } + + private float[] get_cent_sketch(long lshhash) { + int min = (int) tableS[0][hash(lshhash, 0)]; + int mini = 0; + int minhtmp = 0; + for (int i = 1; i < depth; ++i) { + int htmp = hash(lshhash, i); + if (tableS[i][hash(lshhash, i)] < min){ + mini = i; + minhtmp = htmp; + min = (int) tableS[i][htmp]; + } + } + + return tableCent[mini][minhtmp]; + } + + private void addcent(long lshhash, float[] x){ + + int htmp = hash(lshhash, 0); + int argmini = 0; + int argminhtmp = htmp; + + tableS[0][htmp] += 1; + int min = (int) tableS[0][htmp]; + + for (int i = 1; i < depth; ++i) { + htmp = hash(lshhash, i); + tableS[i][htmp] += 1; + + if (tableS[i][htmp] < min){ + min = (int) tableS[i][htmp]; + argmini = i; + argminhtmp = htmp; + } + } + + if(tableCent[argmini][argminhtmp]==null){ + tableCent[argmini][argminhtmp] = x; + } + else{ + 
update_cent(min, x, tableCent[argmini][argminhtmp]); + } + } + /* + * ===========================MinCount Sketch======================= + */ + + + + /* + * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid + * vector map + * + * hash the projected vector x and update the hash to centroid and counts + * maps + */ + void addtocounter(float[] x, Projector p) { + float[] xt = p.project(x); + hashvec(xt, x); + } + + @Override + public long addVectorOnlineStep(float[] x) { + addtocounter(x, projector); + return 0; + } + + @Override + public List getCentroidsOfflineStep() { + + // next we want to prune the tree by parent count comparison + // follows breadthfirst search + HashMap densityAndID = new HashMap(); + for (Long cur_id =0l;cur_id<2<>> 1; + long parent_count = count(parent_id); + + if (2 * cur_count > parent_count) { + densityAndID.put(parent_id, 0l); + densityAndID.put(cur_id,cur_count); + } + } + + //remove keys with support less than 2 + Stream> stream = densityAndID.entrySet().stream().filter(p -> p.getValue() > 1); + //64 so 6 bits? + //stream = stream.filter(p -> p.getKey() > 64); + + List sortedIDList= new ArrayList<>(); + // sort and limit the list + stream.sorted(Entry. 
comparingByValue().reversed()).limit(so.getk()*1) + .forEachOrdered(x -> sortedIDList.add(x.getKey())); + + // compute centroids + List estcents = new ArrayList<>(); + for (int i = 0; i < sortedIDList.size(); i++) { + System.out.println(densityAndID.get(sortedIDList.get(i))); + if(get_cent_sketch(sortedIDList.get(i))!=null) + estcents.add(new Centroid( get_cent_sketch(sortedIDList.get(i)))); + } + + return estcents; + } + + @Override + public void shutdown() { + } + + @Override + public int getProcessors() { + return 0; + } + + @Override + public List getCentroids() { + return null; + } + + + public static void main(String[] args) throws Exception { + + int k = 20; + int d = 1000; + int interval = 10000; + float var = 1f; + + Runtime rt = Runtime.getRuntime(); + GenerateStreamData gen = new GenerateStreamData(k, d, var, 1133131); + + StreamClusterer rphit = new PPAHStream(k, gen, 1); + //StreamClusterer rphit = new RPHashStreaming(k, gen, 1); + + ArrayList vecsInThisRound = new ArrayList(); + + System.out.printf("Vecs\tMem(KB)\tTime\tWCSSE\n"); + long timestart = System.nanoTime(); + for (int i = 0; i < interval * 10; i++) { + vecsInThisRound.add(gen.generateNext()); + if (i % interval == interval - 1) { + timestart = System.nanoTime(); + for (float[] f : vecsInThisRound) { + rphit.addVectorOnlineStep(f); + } + + List cents = rphit.getCentroidsOfflineStep(); + long time = System.nanoTime() - timestart; + rt.gc(); + long usedkB = (rt.totalMemory() - rt.freeMemory()) / 1024; + double wcsse = StatTests.WCSSECentroidsFloat(cents, + vecsInThisRound); + vecsInThisRound = new ArrayList(); + System.out.printf("%d\t%d\t%.4f\t%.4f\n", i, usedkB, + time / 1000000000f, wcsse); + } + } + } + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void 
setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + this.so.setK(getk); + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + // TODO Auto-generated method stub + return false; + } + +} diff --git a/src/main/java/edu/uc/rphash/PRPHashStream.java b/src/main/java/edu/uc/rphash/PRPHashStream.java new file mode 100644 index 0000000..462c99d --- /dev/null +++ b/src/main/java/edu/uc/rphash/PRPHashStream.java @@ -0,0 +1,277 @@ +package edu.uc.rphash; + +import java.util.ArrayList; +import java.util.List; +import java.util.Random; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +import java.util.concurrent.TimeUnit; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.concurrent.VectorLevelConcurrency; +import edu.uc.rphash.decoders.Decoder; +import edu.uc.rphash.frequentItemSet.KHHCentroidCounter; +//import edu.uc.rphash.frequentItemSet.KHHCountMinSketch.Tuple; +import edu.uc.rphash.lsh.LSH; +//import edu.uc.rphash.projections.DBFriendlyProjection; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.standardhash.HashAlgorithm; +import edu.uc.rphash.standardhash.MurmurHash; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.clusterers.KMeansPlusPlus; +import edu.uc.rphash.tests.generators.ClusterGenerator; +import edu.uc.rphash.tests.generators.GenerateStreamData; + +public class PRPHashStream implements StreamClusterer { + public List is; + public List lshfuncs; + private StatTests vartracker; + private List> centroids = null; + private List bestcentroids = null; + private RPHashObject so; + ExecutorService executor; + private final int 
processors; + private int concurrentRuns; + + boolean initialized=false; + @Override + public int getProcessors() { + return processors; + } + + @Override + public long addVectorOnlineStep(final float[] vec) { + if(!initialized){ + System.out.println("Not initialized!"); + try { + Thread.sleep(100); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + for(int i = 0;i(concurrentRuns); + lshfuncs = new ArrayList(concurrentRuns); + for(int i = 0;i noise = LSH.genNoiseTable(dec.getDimensionality(), + so.getNumBlur(), r, dec.getErrorRadius() + / dec.getDimensionality()); + lshfunc[projidx] = new LSH(dec, p, hal, noise,so.getNormalize()); + } + lshfuncs.add(lshfunc); + } + initialized = true; + } + + public PRPHashStream(int k, ClusterGenerator c) { + so = new SimpleArrayReader(c, k); + if (so.getParallel()) + this.processors = Runtime.getRuntime().availableProcessors(); + else + this.processors = 1; + executor = Executors.newFixedThreadPool(this.processors ); + init(); + } + + public PRPHashStream(List data, int k) { + so = new SimpleArrayReader(data, k); + if (so.getParallel()) + this.processors = Runtime.getRuntime().availableProcessors(); + else + this.processors = 1; + executor = Executors.newFixedThreadPool(this.processors ); + init(); + } + + public PRPHashStream(RPHashObject so) { + this.so = so; + if (so.getParallel()) + this.processors = Runtime.getRuntime().availableProcessors(); + else + this.processors = 1; + executor = Executors.newFixedThreadPool(this.processors ); + init(); + } + + public PRPHashStream(int k, GenerateStreamData c, int processors) { + so = new SimpleArrayReader(c, k); + if (so.getParallel()) + this.processors = processors; + else + this.processors = 1; + executor = Executors.newFixedThreadPool(this.processors ); + init(); + } + + @Override + public List getCentroids() { + if (centroids == null) { + init(); + run(); + getCentroidsOfflineStep(); + } + return bestcentroids; + } + + public List getCentroidsOfflineStep() { + if 
(so.getParallel()) { + executor.shutdown(); + try { + executor.awaitTermination(10, TimeUnit.SECONDS); + } catch (InterruptedException e) { + e.printStackTrace(); + } + executor = Executors.newFixedThreadPool(this.processors); + } + + bestcentroids = new ArrayList(); +// List projIDs = new ArrayList(); +// List cents = is.getTop(); +// List counts = is.getCounts(); +// + List cents = new ArrayList(); + int i = 0; + //get rid of size one clusters that are there just because they were added to the list at the end + for (; i < is.size() ; i++) { +// if(is.get(i).count==1)break; + cents.addAll(is.get(i).getTop()); + } + + ; +// counts = counts.subList(0, i); + Clusterer offlineclusterer = new KMeansPlusPlus(); + offlineclusterer.setData(cents); + offlineclusterer.setK(so.getk()); + cents = offlineclusterer.getCentroids(); + + + +// while(centroids.size()so.getk())cents = offlineclusterer.getCentroids(); +// if(cents.size() vecs = so.getVectorIterator(); +// while (vecs.hasNext()) { +// if (so.getParallel()) { +// float[] vec = vecs.next(); +// executor.execute(new VectorLevelConcurrency(vec, lshfuncs,is,so)); +// } else { +// addVectorOnlineStep(vecs.next()); +// } +// } + } + + public List getTopIdSizes() { + return null; +// return is.getCounts(); + } + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + + } + + @Override + public void setRawData(List data) + { +// this.centroids = new ArrayList(data.size()); +// for(float[] f: data){ +// this.data.add(new Centroid(f,0)); +// } + } + + @Override + public void setData(List centroids) { + ArrayList data = new ArrayList(centroids.size()); + for(Centroid c : centroids)data.add(c.centroid()); + setRawData(data); + } + + + @Override + public void setK(int getk) { + + } + + @Override + public void shutdown() { + if (so.getParallel()) { + executor.shutdown(); + try { +// System.out.println("Shutting Down"); + executor.awaitTermination(1200, 
TimeUnit.SECONDS); + } catch (InterruptedException e) { + e.printStackTrace(); + } + executor = Executors.newFixedThreadPool(this.processors ); + } + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + @Override + public boolean setMultiRun(int runs) { + return false; + } + +} From e73aa0281cdc193961886bbfba389db8e206aec4 Mon Sep 17 00:00:00 2001 From: Sayantan Date: Wed, 27 Apr 2022 10:06:21 -0400 Subject: [PATCH 25/29] updated knee finding on jave that matches results with python library kneed. --- .classpath | 1 + .../.jython_cache/packages/jrt-fs.pkc | Bin 0 -> 767 bytes .../.jython_cache/packages/jython.pkc | Bin 0 -> 142296 bytes .../.jython_cache/packages/packages.idx | Bin 0 -> 415 bytes .../edu/uc/rphash/kneefinder/JythonTest.java | 174 +++++++----------- .../edu/uc/rphash/kneefinder/JythonTest2.java | 40 +++- .../edu/uc/rphash/kneefinder/JythonTest3.java | 58 ++++++ .../edu/uc/rphash/kneefinder/KneeLocator.py | 34 +--- .../edu/uc/rphash/kneefinder/Kneedle.java | 2 +- .../java/edu/uc/rphash/kneefinder/PyScript.py | 13 ++ 10 files changed, 182 insertions(+), 140 deletions(-) create mode 100644 src/main/java/edu/uc/rphash/kneefinder/.jython_cache/packages/jrt-fs.pkc create mode 100644 src/main/java/edu/uc/rphash/kneefinder/.jython_cache/packages/jython.pkc create mode 100644 src/main/java/edu/uc/rphash/kneefinder/.jython_cache/packages/packages.idx create mode 100644 src/main/java/edu/uc/rphash/kneefinder/JythonTest3.java create mode 100644 src/main/java/edu/uc/rphash/kneefinder/PyScript.py diff --git a/.classpath b/.classpath index 98c1e44..90af4e4 100644 --- a/.classpath +++ b/.classpath @@ -11,5 +11,6 @@ + diff --git a/src/main/java/edu/uc/rphash/kneefinder/.jython_cache/packages/jrt-fs.pkc b/src/main/java/edu/uc/rphash/kneefinder/.jython_cache/packages/jrt-fs.pkc new file mode 100644 index 0000000000000000000000000000000000000000..4ba81e6614787aa387e6091272aea7860c056dab GIT binary patch literal 
767 zcmah`+iuf95MBNNl^@_4xvM5Xiux3qQc;9Z6stTSOV!pMJ7YGxtC@A0JXGQ*korOW zCtg{vZItvP{4zUd&hZ?b5x-_9*(C#U#%ilfCfzu5*6PgZrb5G+rAk7s(J{!nVP8w@ zz|_T0xwM|0^|B_AWuvvcLGyMjHyp8kHR-O98KOIXMvk(ybjeL zmMm*ULcF+rbN^C^*X=*icp#&+tZ}6(kOky+Pz6vj3@%Io z8{p(@N0toS|AOZ#_m;L#3KrljV;rYg8RZ*-xuTFFcv)tu}a!{Mf%Hcs0DiYQM4=L`wr6?!|p o?4u7MUpGG7_95~e;bs8Sa3J{C#8quKSAN;nOOKk9&O+Sy7jW|-=l}o! literal 0 HcmV?d00001 diff --git a/src/main/java/edu/uc/rphash/kneefinder/.jython_cache/packages/jython.pkc b/src/main/java/edu/uc/rphash/kneefinder/.jython_cache/packages/jython.pkc new file mode 100644 index 0000000000000000000000000000000000000000..ba677f9a1e6236cbadba4bea888d68d2f4c07b42 GIT binary patch literal 142296 zcmdqKTa08|njW<4bT^~%cs$dd=^4+Z+irShJeS=$m06clWshevGBUF=PUR&|MpT`e zqcO?Oh@F{H6}OHZk(GI5Zbn!LkSy~uSTYhIWDqhE0%R=26UIDZ2_(c5LI@e-7ajl! zfspvV|Nqx*?}(~n$-E#{WvunD%U*l!wf^CV=tTRWECdRr{A?7{s18U1HJon_gt>`rFG)wulOXnHlv zH>9)Qd^=riU0sc~=8M_wXjm?`-b!vm5)qw#%yuvkoI`Qc!?9Os== zmgSvpcQISe^5bH9-QiG=XMFJeoF0yg>niV_ch2(etQy@N7Q@?7Rp!!k zFdJL<=f|CsoPRp~ymzx&4l!|gZ*)B^#(8fxcqf0#<#BmiDxh4R&6lGYD&1FjnXA0t z?&e<&TTO$YabqFRp2t-<1czMO4U=o+8t!N0oSYweG&%WBDo zm5UttgXv&498Is8e%UU@gB7M|CPltd|JCE|$4@MK&_+$4esR|8KYG0VumQN(dAPmf zp{M6AIox^jWczuV8!Ycq!2HJNA3uH;r+rE7!7oa@&uCy(k4}m~?R`4Ev*({ae>&Xh z_Ig&!gF?Q)C`MyURUT?}$HizGPth2a5kwuu6N>IzK4}JEEdDKi#N|7ZR?t#{?L82M#7nwEZ{X8*z0UYQ$LBG>KzXNzyQZJX=7tY+iea*_S1Xf+n|VsKM# z-IWXMvaNZsAiDVE4_lYja#0MH`)4Ozf#nX;YrUyK-kcoo&jzbWIbB-GUQvw(n#RMg zM%SxFA-kiEhhZseQFk^Q=XgM?x3?ONfigOixd5ye|7?Z#_tU=i`}F4Kaf0b=pOEmtHwoElOYk;+keXmX4P9wbsC)=jBzoD5rz6cR#tDjr+yr zxUAK()wOPDZ#vV#cr`3rM8zMfu(s^vczfGF3(w8*AtH!OxPUNIzFvTSU|yPE6p$B- zVlpWf_6ZvuTCt9lLvesbIoA~gm@DnhM)+onHLtaMcKNm(EDy2vW{dk=z6)l)x0=sq zi{(cVX*s>d=B&Q?t=2*BK+-u+54+ebz}A0tzQ30lz!^cIF80R`)b*EP;ILcJ+6II;NPcD9zA#M1mH6GH07dq zGh5Buqxns_SeAFoyd6Pji_+KSiG!6M^}g>X3#oGGs%j>#*XzIf^y$uH>uTS!QH_J~ zi(*oYt-`?)V<}gYj_;p`!p^hp$DdjapO5;mgJXvWl$${;rqOhJ=b3B9?CrMYj@qH* zsJ-*VO?7AgztQWU9c)&if`&m@>`lRc=6q7G|tw&bxO**zWzW-g!M%yp; 
z?8l5M&HhU}q;n8Zm_42Yw|E;;Y{x_niv=jEbTOEbZsGxSy?-xpd?S z`^qC32L1GXt4EmP`n+0 zX>-DfY5D#JxGy{3Yo8u<7o%ICH!e8I`jyT_&V9#0_vWMg^av0mO?|#wj25!nxt##b z{zNno$TY@_t%c(IPyXrFY5({f^MNI_D2quh`Ob8{TE>9$k_S%=K^ljaL3rF^+0U-0 zqpvWpd!uC!YhiD7MOI`e>(FX~WOUtxBvvoZhV~fSa5RwGIdJS`Hf^M>m;J+5eo!pN z_Xl^Q<-y&c)Fp99Mts=CBjs=i1z^Xl<#Bmk48S${-aQ=rW_~iNfDb@^4e_Dz3ZaKk z0@nSyTvxE6zV1Rl#?Vn8pYe1C(l#z5r&nunI(scAZEGBj$B={moI@6NUM^RQX>Wif zL=41FK`a+K58nrL5vE8dz1~#M0DQ%v+v55S$^Xx!2ZvkdU+YRd%aihij5Jni!hrG{jiTav~V-3$3|#e*`hf zxUAQ-;tzo1UJlV2PoV76XpV5*60-~d>)p9h>{!cFdWLFBn(;DumjkjdZ zD#{sg+cKAKWNM!rn@$3qNJYE#t+5C&q5(0%PKbZ|#t6$;_A_5fx z=cKyUM0n-A`?4c67Jfl(%prV(co~aDW(OS}kEZXeQ#?ebwYupqR@JgK9o)=7JD-=g zAST@V_KWN5a_E;zuKBf7V?}`=w#wGkE>~Zb{pzNzr#u9way+`sexYrt#&XDh(UB=2eD-R zXgWL(ffz}j*i*=R49RI)-uP*kTP@|$`bm8qgVqLgmV#>|a;3E)bw~3OoWGw&mv4L+ zd<4*ZAdYU!bmAKY^=}YWurKLeWPP@EjR#%J60>QrdSL;qUF$fqFYzWrYgAirGwhbZ zl7}wD3E}Pb%d28FUN+aV6tG7a3WCn!5Of^^wm$W zNFcNUdouK6)1mSZ!%Kfbs#K9Kh=1@LM`bZ@;LNq71;xP$#m=&sbMKl?G^GCP<~_yO z$_!dRDT)GzPezfNZ+Qi)B zp*;K6vdJJwg0x|g_g2uvLp$Jl(5l&=96&$%)o3sVS79pd>PKU;+ThP|Ze@SG_xZ7Y zJIs&4V#!BB{tFF)hYYgs91IVdq-gfj5$*Ao2tE57El_mEhV;wD1cEJ4@`~_=~it^de7l>EYQ-Skx1E~${qg4F-lZWjV zIfePKHJ!@xnBu60{5z0M%r>a99JFlQ0@VeXFDAYdfX)^Wu(7M0f6mLfApy%8y~;92 z-6KhAggg#K9X!&;;}UE+jWA{lkif2guv}>l-McbUzarJ14k4QY!G`XDBcd8-XM23C z(2Cu?AWJ4eP5yNk zJ?oxuPv z2+;ML{d%+~OreiW36V|(lLeKg3-eq(_%B)uXy9*4;RGl;iiJ;mB>u?L26+?_fYxoF zLtFJ4LhGx0;Lo@YrUh05OIkO88Pp6D<=|blnm7iw(MXHe_eez5yBU?^Ax^<17~^P2 zZH$tK?>ZQ0zc6+=HIK1Ump9PO4&J4Oq_^WoB&D4*ed*{KTpOUdM}sQwU_>MnD(Hr) zC0IiqnH1WcO~?1WvV2EX4QCFMA*_pob*8F5O3N0vqXAS&T&%8VoPg-J2PK*gF@4?a zY4p97Yg&P`+Y;pxkl$8ygU*Iqto0UpbTG>*JK60*ht9!$b~`b4^!-jQGB&<1kXcn+ zm1c0I864TI#iF=ZU?Q<@;_bU94?1tAq)} zc|xFoAO%*yr!s=20oQ7?MY2JYX*2sTD3HX)tIBd`^Rx{-t#)5$t%I)5i0sJavS-oE zY})t^17N*weN~w?O7^>7R!gYF6gfZ*6m77yB-s}%lAV$L$>_<+8_5BFImrI^z2@N^ zgfn`2^{^gF5tiVMOoZcQ@hKdJAd7N|=@u|$D4;|1XrgM;RIgH~eKXmk$bq`K=y7p6 zsxCtzg>Wr~K15!5_wwBkc4{?fIuh!KAO%}sucJx!o13PEN(lSdv|YHIz0dw#Xnb)# 
zoog1aW+F>0E6DJi!Y~vkiiW6Jv+^2HPT$ciQtemvYaci6+$u&j!_HVpbUmAa zq>>!s!hYNWTbjRin#`UAW|>5l8b3%(*3u51Wp zuOU@MNiVR6Ko){6=*D?6MCmHYxTC@<^j4Qph?M!!jFyCXr(9m~v-Iw#%i=EAuL~vw zLMh4M1Zs?$Bx!fGfPSRJRQh3a7~zWq#p(ul;H%OK6*v%34-jOS#h^o3V(u%mEFxuj zYfzwX(&pj0`mw`^uu=5$hc#x9lVU!{i@FM*uON}5jRoFLn*<2j!Uo!@LFIed+J`}n zZbVE7j&9hi0NH^=<>FIo>hl@&nL5w-3bfwyXT9GUAZD}(y+q61fc^n84gI|y*QzX9 zOS?6lX2Wp$#o`*ru>M4w%^;QP?kNz^kC4CKPP+<=@AZx!bhKmGXIzK24EgJwXsA)? zqfLE-ta`oS#ORVXX z<)UE}=`AZZSk27}fasX}5#)2@;=zN@`v<+_R<1wz{5G2Ol=G$OA*bN5FgXAG!Ntvl1C+M zg2@m>FbFtqL0^L=q&ackgPjQI*M|{Nu!%}5lrIB~wuJY^4OX&oYVw7+jx8TkY5|&1 zX#w1u^SJa~2-2XN1^ZJv?z_#gnJxGV9Kkof?UyU{ZFl#u6T7B0gy;b)(O1Gu8xO!C z2R>Nr%V}0P#|mWuB?KnZiop5~s?0_$jwB~7wuEVi5$0+v#D(sbf7BJ}NTH2^;gF6W zF2(Y5a?mk*q^U%}v92yqE2h}gOsJ`EQtgV?U5P8lHAH9= zrb}FYZ5WVbzx5Gfc=YJm(=8Z=OzFLpxH`p#K{Gih-J!Id^Y7nJ;X^gZ4=G-86?Koo z7WNXFR;cQ3W-uhGcdc;O1PXCj!noYo96J&eBnfr-Bc=w-d4gv*yku=iAaivu4wtwW!OJEQp6nP8) zCJ=Wakqm%jK7t6C^3hNS4p{YO1hs2j32aT{lh0o_YQBDX7W!{i=$-7~WW^$YTm*CW zMqIc;gE#QeigL{%Z|0|*fMSsSoi6FXJ1N9f|TeD6>68eE;YXo?XR2q$yU=8sC~>xDkQtCl<2&&CIySm(k1`&+jy z7#u?=u0Q)=6mDTyf*ja;`w76MF1Bckder7Ct*%{TmvTjWqv_ckqF4O&ARsfsmi&tdM7yM!+wx;(g>V_h(X3DZOmjH)RPddtZY>>teu-HuLj zzN@CR_n1j2!=c)@VV_rlSj^3H65o9&CIl zurxBR{fqJJ67-?|IAG^0@Az-hxnf)S>&qwfYRYjq;U@(iLRD@BO!`7d=)uqOZvasq zEoxO1__QlNE+&`50;ruo9tg@+q}TFE$!akXm%6mAb^UQbA^~x%^?=nR$LZ;^wDA^0 zZx6t!3#APIIuHa~u#A$w9t2+yu$JM=s+}c3G`DoMRN$(M5$Sl~$og8c(&W>7&=N|_ zrT^k!KGndPys4&q%@uUq0OPL%^%umiSgg5EMMbb{#NNDTnI(N8t%34B|2ojCDZEUu z-|g3d?I;EAFLO(G4)ZT=VEv$fT}4=r-Q@h^0Kd&q z`?&}k!!)!6?*W2n`NVme`xWMA)YuA0z4TZ+@zn z_f(WY$Wf<=zk+&C1<&TEf?-dEY$75!<>De9mBgZSlYy+_&HU6b=BXgbQ-=rh(*&0> z3oM4ea~Na;@0}9+oJM#iKTR;qsiBioU=aK#;y6_-;eG`_oC+eyPjUFmw4d5$KNaDL z{N)?ur?!hvb>ngh^V4J^v3c`T*|w)}R~nQiM>~}*o1cCTnh{NO&z^FNA|typKXvpn zQnKexvB!{hdn!Nm8%C0DtDMT7$WPsFIDNGk=cl^dPi2Xp>bmBb@#%);VwXQ4-_Zqf z*$<+{ljtA@SPW+J|KKltHYogZh-54#@I@(PsUrQ)nMTWI#>pB{0rC)0Sqaao`RC{W z>%rtQ0cKO^0O;AoSk20ztk#HIYq}IADFk;EFO2Z!>3E)dNsqAfd<{x%ZnvYit+lw_ z!I3BNRe(;QFnLu 
zRrc)7m)y<^5Id>LE`}$fpf_L2yCycj_AnFk+)YPY*I>8c+FfN+=Z65!7_K!D`%T{5 z3fDt}Z&1XcvsDis9up8Akuy=xs(fGvQyVyg5z+Me!4HKSfbQ7CWV z5h!Ipp>CS3d);RbA0{mu5Vy_Lz>GR# zxO<3(gKRz>Tk8*F@QL%&7r6wX5J{qqst&NOclXctUbq*_z8$Trx7A{2=TY{qd*2gR zE_0tvv(B_Bg{pH)S7Mg%RHWHJ9{18Y>(d-R1`&{`K-D+_Oeq{k807_aUHGdj7&Q7P z|HJNQN4dLb`B(-V`EIL^qJ1r1J=rR+d^r z*LhS8R_XdczCxQ*>^^AxPE5xYO{lj^;S!`_&RR#m0qlZI$RwWxSre>ib_`=0_j#~{ z1CXP0=t;pi>O|;NX?A~f3w>^nJdd78Sne5_Xffs>oqUdz0?4TVoc^7`B!chpFp5*pT z+Cw1hMWiZz4V?W_gipIBwJ(>jubj?sR z+&*XolVpf`m{5toB*no`E$XD4a&#+~q${Vf9I-R1GD+0=wJqder$*!6VL~*}@?17N zxCRPf+4Ujq<@6~qzHyOqFns0=Ssag+h-q~I61|xE82}ANPuF@`@<22B;FHK_;1}xw zT)ztOtKAm#NszhX9Ssl3`Om-7yo=7%Hb zcoXy7^lLBFcs=_c9JNUY6R3P}E;;#?@Qot)?o1{tv9u${+lT2fvEZPjzDoR32ZwD|wPvWu#kg2O&iM7?m5dpmso=DfTe>~cI2|+C^KsAArOZG4JPC6$Cp2dUq zjCs6jaXA{p4GC}8fcV!bry)Sp7L${U>I1JU2V^}K2kfbWN91tEF zG3AQ*#CQ=TQ$N~%HqG9NSa%Qh!e~$Zo4B+S2TF#zz(AgPkSzjeZp1Ign!`!ELQ4|l zCSoJ0M_VJUpeFzC9}TuZl5gECC*$lpM}06TXD6IG@y?fRxOvP2u(1yY{!5WTHD28K z?R7;aAzw2_@u^9Zfyl@JNbB!@dvkwadSfwTnAZV>9Knm6x zyEu+TYebR)0c;WzD5Dwg*mczZ9lqXAy_t+-qIYvtFUD+u$hcfwK_pdW|Knbpv;_Dj z=n$C1Uok;*#2~WXh>2y+vJv_TiAyC-G#OVkh_CIuw^`k#1@*!N&B7^0%{|Dshr>6U zl7(vgv=V22I| z`5GMIzXSy{?1Zybqh(g8R_esUY8O1KMaukIrH*}u^i2Bb?i~mP0n90)2ruZV^@e&% z%Gl6mPB^!Vj&w`>Y;=eXIL|9`5XVB}y;oBi?Ab*Da`#54EjG(t&?w2Am+(2KLn_fV zrCQT#f#X~OF9lI(yzP>Y4UXJ>pkK0c)`Y0h8y82*<(w9&;pGZ9{QNV>){S;PnpH~@ zXlA@h?_QQ3^!hz`Q$w@o+XK3$a<6yH=&7X(@7D%th?Tc%-J_rOm_d)c&5r}rr9Mi^ zqdo_&ZT#yTk0wyt4E?i=WcGREj_uht;$#BXx>VdwviG~~aalm<_gQgUJf(memPh!Q zIIY~*+6P|*+4OYz@WaJP={iq~xI!W-Oo>uFeGG}p6y|2MoA53m*>8y^5K1Za(la+H zGd@nss-*u-qR3#NHe5~S+5feVzh;sa_~QsDr&}HaeB_k~7*pvHxVNG_@}9IZR9HhO zOD08h2qV(rV#&P_j@(yMxR+>1oJRG?EzeO<_-5W1+;_4|yH&2f@M^Nl`o$a$X5O(V zL5I3l21@X5RkFIbW+0RQ7hBYzbh$ae|4W(eK};DV21Iq%l%grwsA~uiA`-aT!p%|4chB=H|( z@gk1w05*tfD@hl3hdVOi-M}osHL<9qSHqY`1`6usHKZS>j%j@i$l;ozG2UCg; z5qGf4e(wm>?>>^dh`6ig0H^@+{4BshGzK?LmR$UGsn`JP43tW}H!`Wptf>c>OJ5Im 
zA+_g5nN7JH;*7H`6j)@r=(iASLvetGLA@2_2V?A0a`VjsL(E!`)H2_DPaG+)J|kUYtE>y@|6gAme*f>rY{6FAD3QH`<=n>b2|n zWNa^L)n6hSWZdP;cKV2yh?ba?ze(Tbw1cBR1)a5%*RMSg(OK*DaT0f5bZg^Y11W}Z zmW&6}>%^a7*Xa94`jL1dWIq#qU+M;V41e<+_Kk3{ATsz8TZh5E7-|YZ)(nCeZNuRW zZEe6V(}qw)03tASHnt=RVc<4GOP6vqm2cFB_y7VW+GUU?R5YNDx6%^WZy|m7mWa7s zA^?_qD)%<*h4=({xX8f)U_%FhoP#8^4pXf%fq7k`HKB+RHrog@>ucd)!nm;4ac0w; zTW5d?PJm3u0c7e37y*BkVFBh1{$R6t^c;F`el~nbW;Mh#k)#tU*ZW1wMGx@_uCDR3XkFGAu{RBv&ORos-?`F6u{~yx)R-HDG0=9F*(DB zxhN7GzuM>feAt=^uSBmXJPlxF9H}`l}5FWo+#(o zng*!C$jKyML6!`~m;fGy>v%{3C4+KOv@7h@ZbW5g9$~{CYWjmjy#73WjykJvgBu z9{TlzA2`J3BxJ`Zz*pf$$LKZZ&dR8cUIY~Z*eK1qR-X;urQIO+__LiguoToAUp@XT z0htr4jf%bupjGHVsD`!vK|(lEN-5XOME#&3JX$mQ04MjbF7Yl=tAr*o83L7Z)XshD zC#jgzIHsm}R_!LC*dhq>tophD1!k?o_Rbp+T3OUuk~BSpoS^}SU3UYf7YyYb-QH@s zM5yLkZ~MeU4Pafv=!fkQV57%?Pzhd z7P#giM~%rQ@~*cg3@fVv#&@s75QA&Y<2w_WSTw2mpEP)|wG{M5= zgKpiO1R5|Q8}7A#Bq0-mjuXb~2J#AIV`4n3!no!d;Q%Ez9KsmyK*&O5px@xtf+Pqw zJA8`Feu^2$ew9MFSZ>dJQ*!^k=rCV%wtn*82kR}e13?P!wN}7NIAN7I5H?>p0Mm^) zd51Y2c#VT8rDM({*Q>($=Ma$Lfi%;GR4vp6(@h}IwMwt6Qp*b>pO zCh_rH!jP>-oD4ArLRfIfZ^be*fe6u+K$Le%L1I^&fILEVQ0z)sOEg$>W3)5|=~Hf$ z5KY2nuo3P-z%oj2n8}JV$^@dLOhHK`2IT!vSq`^B8;(y|!$eV^Zp{(S?kyN(-c=9&zT^3@`bAT*|2l6??^VvG zX@oLCLZ70Xv}N~SyOW=V=IIn*@GS)rLS1%kKRns&ufbSy^9VanN6sX<-!luya-_2)e?5kxP9SXrW-)zA(X>m39^wY=BvhQmE z4U40p!WPER$oWUI3D!?`iqKDGb@kw1r_f-uZ5 ztP-5XInHjF`!B|EuZyt|uKR?a;qDF8VbZ<=&aeF(SD|{l)GBACr~_uL2W_swpewYK z$aE6<$ERRE1D6}ocxbVNB*v(u1~Mqpt4#Feq$a3_ZWM*B2l1|usO!Sf0fp7TXzi$3 zEkPi=1_aX$EDX%c65O=sLpN%kR3JHrEO!WT?(87b3(FDWX)%C74^Hb*%`r7TtWtBV z8(4;X@_mT^XA`qgfQ$DOA$9bpk`QBUAW=y;_ixEy#GGIdgYGvLpIK2NKE%O;^+gRZ zxLY4yp29@d<^gGUqHy`((kF%chq_W$UJHzRz+J&AGsN)L)=x^&Fq#4s0b5+t4&Qm1 zC0(eyRRTIJ783#BakzGr2Q2L6y-$Ugw#H5+L7Qz6J_C0tw!0OZk_geotWtpfD(%V zS5(>exP1mrqU&!%_Z#MKyEv!AAvuFX=)4gg15D3!mLCofcPH1#O#$)7=ssDUQC=bf{>%j?Syi{UM!he%WSd|CJB$DNa$e>#2KN4r|WryN(A z5oe4M`V8^glz&lat)}f78r*k=m$}OO?QZ@>FDH>s!hBFi$|Z zJ?c6H^;+t7M^U7~1zmECSnH8mM27uQ(CeL#j&Yg&3F5%B#Tf?Ct^^@gyX1AMM3Osk 
zE_7otGswPdA55SeAg;78LLx>(YJGO!02#oSbX}DT?Ju`qaS{GwTs(RY9SO_qpK0IV z@y;5ud)Di~pH_T^LHd+~&Qha**^)BVE1AtFSO9dRJ1J=JMu^zq%wz2K>1fC%vA@W|$=QKma1Ly|6g?>a5>`tqU%ONMVjv%VG&(xl73K`|e;$eeMg#qvYT<*&0dk_>Kba4wlz#b@1PJRo|G zd1z`2%eeZMjcU6R`*qK!PPtTP-^vqOa+wbOe**}NEsuM=QPgKhT6cla1IMWYbJM5d zMO>$<6Ne4Lo4|m;vDJ(elm>2K4)_Wj2skJ>O{{52##SGL{xnu87}aEA))cs@SW^_< zi07s6ab1h3s(UvX3flQMa|Pn#yJq~Zd`%JKOIEHg(B1Ju@{&_1*xnZlD!+6VLf(xy zK%((B#_0g60tW>;caLZnFz+J(0hM4LcUz1N!fQq9V_fqZ;q$p?_{j-Ho2Dn!SkrTY z7Gn$jsa(|vrw57a0Rvjnc-b#oXi<}>+sqm=-b|>_$**F(g#Sr=`XmSdLP6s2);N>U zargtTQMkFps@5xjYqHCpT?MC8(Y%@wG@q$(9`w8+yY7OQ0BcXEc^67m+AZun+kX6M zIy~=6=u`~8vV?3%JNA{`3xvK(Tvv@zHfjj8bKu?rHWHx?8zdf!QaSco6adNbUN5>c zJAt~P2yktXh~%E`O}vj|f*AK`{MDq$e)ssxlNR|{F0?fEQCgC4$ShJyHx9k)C|w~y zv&X_vK#ERtZG^yq(EPltu=S7u5jl;i(Ou{ffS%8Ki9tz1c8U0B39<{J@}Wcf?LLw; z5KI(l9@yloHr}gZ)Z-@|hqVvdvZvMVNX3WPcoFQN?OmLpENf#s8ty@wjyQmcEXxj4 zWN~lmyVYD2tjHx@=}iyjZc7Dq(>KFeiPHx%ID<0lg?E9$@fCF-ro3fH}voDyDecOLHi}F(wTnV>2C<8T12j3DK^a zg4jQReeO$i9UFzCa}>w}#SO0kaaAe2L6onXvJ^iKFOxGN7CaJR5pZ3M)0yx*3B~LF zecZ!vu*zHD7eVqFngwjBDrA97(w?MZ3PK!WAS6-39y20voWYH8!r!e0I@LtnAdEDI zQhJee0a8@2d^FjK>32ctT&gBjb)aC`mN&TJQrr_Llb@{+9u>dP((o_u%!2w9#q^-4 ze{b*%l-WxND*xR%0ae0N>dG8|*qD~#jd{W>s&5aLg0zcug`ln!NXWs>m~CO{8Rw)r5(qe`Nc>W{DG z<~tvKH9~#-@kK}z<6>n9+|sTX77i1{^HFvy&iD!o&=?4$J=b3cUPvUZTz?!82Q`d2 z=ehkl@DgG!oh_fTqC|76JH}DxRL%~zxDB2@&K{DTW9%9E!D&H-uFJu-71+dbPKO%D zfk``tYsZiMb$3?urXwTtb8$IUEerO*at8S<@HX9@*OU+|gFU7BblT(;&RuY}<$Qji zV^@m@#%KU?138>I*vxEId#NwHR!umjd6w~z9~YQSFlMViC&FAJUFXFRucy{Jp*t+ zn}4r6Ib0p!Ad~%t^TU-TBX>ZXV9UCL`h$~YbuB`rYweOD@&w{j#!8gq$7cwH;Rb#iByX7Pd8-`$$ns){cd+rRl6;XOoHtM}s#|0d2I|xlpN#YEezrm=Qu4#z`|6r|FK z1Qt?678r(2tBdNO-%GX?+5x+ILymL}iQ?i@zJ_D_#oDeC_%pzT#*y6f@Dyr0n6<*; zJL2>;FA||hppvh(7Zv5$+k8f{HGM$=VQ+@PG;oJ=Xm*QM(M&WK{*ze^ecA~QJp=Il z?&^EO%A$T;^$6sm6f`T>9~x~yt8Cla_1Yg-x7&|63TJ~O-hQ zw|sa%HPOVLS6ZJ)fp>Sf1K%!+ci9dkf%PghCMl`cbk?5r3N$97+y!Ao)6RXK8Y1%W zYWc^UpAxeN61yodmSP+v8FgV-2#GYVDa}DcYkeW5WGMM3j1cO}cka~lFRmuKKFik; 
zeI%v~_7E0+5|@YoPm$fxz(dHx2^d!xtt8Z95Dvp6VnDAb4Kmta{8;h2#8G$eGE$Xif>o2(r z>jY7=Tp4VNpoeyI4R%_F23?^oA^UYZ6t?AJCv1Z;^Rwt{QF;Lj4YNYFHQ-LA__KnK zEmrW>^CXa+KULsx{ZjFDLK(XPdmNXD0&TH^4+SO{QBIw?4jF39mkrs0#qRM6?miBF zjVr6JF&FFeBvyF8kMm%#S~!%!0+vI|s&>T9Eq%yL*#Yzd(Gk{p1lzE%xp!dT#O02w zCJ{wKxc&UadstC~ozS+oqGT9Hp~Z8MTH16&1*qjp1eEo!2?5`gFn z#A%TT_6$UXKvB&#J54O8v?wx^op438-J^{ zxSmaSC=QISF5Oeh4iY&CL|rRv(<5U6m)F%CbO05>-Z2IykiIO?SD|=cVl^gtsB6(n z1k&5w8a?kHtpTllMz7VZ8C|s;z2?5Tw>IjFZY+JXgQ48>)~q^&nQ9a2HCVM?Oh4sm z&+y_0;HWXL#kkR#(=2T)ST_wv%?}62cI{g-%y<)I9vjU9)qHe;!x{TZb*eyH&5DP( zcY&&aTFJ?a#t2S}!GJp19=4*y`c2jo3B|qMzO6?g8+*qH1G(;v)jDWX$&=pp=AsZu zJz5>SeRliN!zY;b@Ri`@yoYzEidv7aq^R+Qdb*L+#JN#Q3+Fm;R~EYO)qK&Tc^X54NZXHp$2;!+J-%tE3tkK5<+}9UFG~h;XRC>ERQ=o$XDp zvyaOWqh~>`tZytLEmUw9ru3kDgoW@M{gpu`nWPs{A_<@1o+Em4lDKbrB1^*o!(A|` z=WWK;c?G>OXCO4u6UORdaUu#7V?iWl`_W7FA@oFPW5$$|dDHu1G#(CMJ+WE&FZ$9; zSf2juUa!+shkidgA(LZ6+6`|9RrcS}YVu$?kWfSU{_D2dS<;j=&Nz~eFd^(Q6Xa)@ zZ#-9^F60AiSehy{2xevZtOv)Bfrv*#r=+uaUnBHb zVl7SfiN=%#F83SrV^e!jGPYFS9YOm7`+L|dX_MY^ND2%-f8jo0b=HxPuIU=dzBfQD z@2{}BMt56tQM7MWKN@e%fycK7)ou3ecJDRq+H?Mjf^oC^Mx>3YKwW14jnlZa+b!N$ z0nFv`tQaP^@^xkEM4Q&fwP(}I+57dt)F{JMO$;GS&9n35>AELU`E$}wJ0~oGWIyRX z>E->RyvqK8V+(L`+hw`%l046!M-Vm7*~10hCQw9hxj2m*V4}Pps4+xbjA8(2@Hop= z!v8VMJS*!^8EjlnRP2Z_6%9aeeFWGXO;L~l9Dl0G5@ybwUI+8I#zQ&w`$>-_s?Fg8 zCh-`dcAO$8p+$;lpU^we*g>GU)_B}L!7pma{jE$v9eX7pHFUuq@({$U!3B@3Bue;# zPEc=p`-d&*L_C+OROfLVOrpc1j!zk(`$xcGFxFFJ=vz{!OM>VC>or4fY`NSp@wf% zr-;2o*BV;DVtr})eug37D*@FObJ!N?0ipm=z$scF1B4Mq!;vp5CZS-q=(VzDNGHXe zkhFXfIF~5ak=jDlr(zSSYJttd$2>3IhqYpM3-#}$PF3b>r7Ge9?puOMb?@CsOhI%i zd=|Kzp|nIgZ*jHaMU@mgdFh+k)f$U{8?!#Fs`1tgu4~!1>9wZX5fja18@W!4zAF-# zD^N^T_EU1Z5+*#yO+)y}e%m43`FaO2ySi|Q@Ep)?j_=dU3%};Yhchq}jeFuZ26GR_ z+1X#w1qYmCzDxEcJXqvE67eUkfMtT2yM{;oh)zQx6Sk~1z-rad_n-jez7F@XDOqav zoT#7^Y(@jm(rv|Tg*{4=jbxBvTP|IRB*N|F$~@w1lsa9c5iUu2l8It=X0gS4s>fCZ zc_kVe6H6gvA!RPKkr`t(!`&*%2?~Q>0VOX=IjS@thJoU_?zHMjE_UWy*T}liEvvg= z*L1VV!*wXiJJbrGZi5%Ub{;>DY>h979}qN*TvEdiP*BP 
zG@*r3MNUlWKi(tF4hgJ$ds(T>b+BWw7BtVu02J!a7BqIkC=w=3w}>f+oD4bpU-h_m zdEDxU<6l=CE$6su0)#md2|)&por8Q@KR{R?lx18U*|fF?urg`RxHj`?^#EagAuQtt z0!Uj#AT-H-5fiqCP2s#`J9}^z>>?V?HBM}xk6eY%@>g`?;V9lkRnib^?{~Vuikni7 zL_q*LWK2IgI$A=Gd~jOb`_38nhEIe7oYcO|r(H3+=LgO&-rkQ8h*qwM+q?|bb}bxG%E*`+$;lzg;npaE6}~|a-nmrW~W8=>mNCp=kvi9 z1Y)bZ^hk~!7}l}@pM2AckSKftZxBnVn59^MQozk(g6k0 zcZ@b+*m>-e&+p!fMlz@qkBFlX5ugygHYR;>%NW@K z3!nsi_@qw~;JU*GSlZCIm>q212`{R5Ry&3|r*pW=xl7Xc(K8Wv(Y#m-Ruz-{n==U$&1u)-zYrWi#QFQM;X@ z<6`IG_D*Cj(kyclA9zl%5$pS-9KqKh{Mk_QWP2wE!{K=?hHXRMKD6Z$oef|KwVf`U z5k4V>8Z(g8*?&cBAnOBi zpa7&Lp*RjX1l%_a{*0cTbsTpc_S`QVt$A@i}5us11KK+ z3vh~nUmxS&ivbsf2q0TV*PtZyv?0M_=MIPyO}#kWIVm7^xyzBF(q#(s64-AcXE#D3 zg%BU*ct{vFzNDlR?n{yT*8_Lp0O+z+r5umQ2FP+WjMVnNFe>)@? z5zh%QXwDHXX^p4EUIB!+Gh;@m1^427#~uO|jZY%rIX)W-nvjWHpRO~h)3KS}5D^}P zJA>?>YPWj*C%Cc%D*|W}_#QsH`v)zBT-IkU!aJK1OowP1@W(DWstED$-U81v<^h9|{icXN7m!y4i*s$Ul3 zI$BT&m&XSMRbe7zoB`@&?XJ8P6jEBL694SL0bDyd`0G8_Nsr`66jKc|NeW2>7|EAXDa{s6)KU=T*1O?V_7-~^ z2K6x9qDT=s^&sAXcx(SK+GNNdybQ$$?Hb@k?`7J6fWv*?BDoER3?^u+#e@H_)$L%X ziO|+8Ry0Zn_$88AB#T&41sWKwsaQp*LGldJVIVU8300+yyb&5Gy8$oD8SpRTf3VdQ zsKbnr#9%f=29XqOeqgLecIX(kQ}EVtqd3N?p{f*he>@R=owfz(Epy#2OjdH=c>Vp9c4@oY0Fy@6T+*aL@yh0eBGTZ2<9|ak@IA zN$@kPSZ%;o0YkKBV&DZMH68>3?#M=Nj1pc&+HVSVqXLwojfpZ2RyXQO8!KN48b`X) z96l&7fuo{S2qb&mT2^Qqs8Z#!$%V+Q;wx=Y+lnkst?R|c7e*qEu_~iaI&cl_S5H%` zN4lf=$fyt=pO7c;CAhAeRqboF%!CjaA`GZj2DZf@s_bzS?17fhFJFhck5q@dUNBI6 zA{4<^aCu_jf-VvI(ad8jA#67q_n{gM^R#J3vh8r1(hw#*> zytY)+EVl5l9H>}Nj3(;O=jH&DxY?5s$1DhSHGOkYdTtS=2dAVzsY$S0A*7uH@@#-02~~|w2lgCO^5h`kPAYy zkOXs`xUf>o8z_EgXOMjkd}FqyVmf**e=mFm;zcbti3%D?m6H`vKH__UC#-P=v1lhn zTuU{(&HfcJP#2q6pHgQQpLH|vc<`N6(&u z_g^sH&`*%A#q~81{Z%@(S1TmNA$aR8!nTcqd+VQSTP$NRkPEQfk9HW6hxQ>hl?mIJ zeq&73R%t2~IJ4q$Panq;aD^lyYzZyTyO^h9V;$92bA^FB)wA!qGX@8`o<|J&?W}V* zUxtK>Keq;xD*J2g6S1JdE}y_!n>asjf;>sGrVIoRWwdFd&;SdHuc7KQ)8hKCze^Js zK2!D)zp#wMLB5`$&K1bBhN>^7ev{9krVYik$iCBQowgdB&t}zTY!tC|2N#s=4-pqa zOx}udWWWSFon>*Cw}z%#ujd(&YvAnY&aO_1w{q_s1IA)D5R`#OACU^Sr(FoWguyka 
za1%dShk#tEubt3Bz~8iIAXS<@iKp$ASL4aN8sL0`Yq_|5sQBGoY9pYttE&nY>4m!roU6|qU6Oj*w8MKR%AsU>h=(j+ z+s=nB6o7b9t;h#Qk7Txev#Je*&{r@UTA|l?fYJpf1>|~F;gR%vMVyno=if$Z?D7ru z?ew9wJoga;KT_xdEuDDnM%BB;@ZlQR_8Qnu1128IY1ixK8mbg*etI=Sz(AS=f|U)Q z#f|nO)i7!u;kw76P=~sq@X^Mn*9Q5aMR5EOk0I5RtSVUv(d>$pjKc`4z`da?A{ES} z6^>!-iyMWLM#r%7ZNMHiVJD+u6X@P*;}cy4*{NYjjiuWBj+;pVA`~d|oz!I8?|g_M zf?x)m23*}i$jS9fUdX~}{94z|4ZtdKdocdbSJ}^fN!Cg8@fj1J`U&!+ERbmM!FL(0 zE|S_x*3m3JFaym5qe{*?P{57KYbtswC`Z_b2BwD42z(FXYnTyXkmfjue@0>_C>%QU zFn9yZZInCd*YnYs%$U%3DTajMk&<d6S zIKuI>y1-E*qmY!0LsBvlNpUPIdCm%TWk?APm5QuJ;L$01=P{Q3N0BwVu#37h$R|iv zRLvpBwk2p=(G)4G?hD^^5KDu9O!Kz=kVIi!&1QwVMeH(bF$y0-D*u=*sMZrUz=`Q& zi*F7pxqBRUwT%~B&L`>L5E|4uAGNAwGYS1ZqNmJVf$rKH6|g#>5>+=U$qT^w!leTi zaS$wbUc%|JJ;v?undQU{B#}eu*k|)kUV``mku!AaaBwQn!-eb$yajM@OMiW>)wKk4 zC0r7&0JrC{2mOhFXNaOPTAnQ;&>OJMfZ@jhArzosz-LFdur$1N*@GYD2S0Kd2zSs9 z6-+0}Se-DJN2kH_kU zq<~y;O(NH^d>^Lt-;F9z`Y$`ZHI)9(MO5+<)N{2phxWM2eq|5Qvv9_XBH0NiaqnXR zeh=1E_?f~g@w+%n*I2NKq5beO`#Dl<`+NERo><%CM-a#6VBvTiFZJEJ9ArP(<6XVr z64Xo(LlMClFTUkT50(QQtAtsqbFA6>7Wh`|!wRt_I9c#3M>=6H2{;!amR&Uy0N~@8 zQ6!lk;S#z=Yh!!s@Q<=;lhGVOlh>J#Y>v#Q(vj?!J|q*_6fd3ANv`m3T5xNuJ`y_r z2;-$Es&R>q{*N+5dhEg)(_q<;9e3IXr@e#Cq@?LB=E9g)*>CJ0F&?EOPO!hpuOGCV zz~lfXsYpG-)<84?7uXwj_Y=zFp#!1WX3JY+$R6Q(6Y6xf@B%h1koKW={u#vMODO{T zk8t8?^%J;}q5sKTWd9#ga0)%}9o)&Zy}!b>w)+R?`io5V8(&8j1iGhg&h%5<*mpO@ zO57LT-e*4vwEOn{Gg?w0SWEVM5!(2NVy?G9f#9Z zR2d_Bi!Ql!bv3#JP87J8JBtUc>;Ye-Qs8lS48i5tLm3yCx9cE&5;V@YW!jwv5^{?RwYM>4Tez?XQ#Z(bTz-TeApoQ$gS#3O)?4-q(K$D{sj-5-8ZoSbN4xvhQ-M6}tx}->#;YdvFT9 zmyf?B3skI>@J+~RVTYkyJ`jSyQau(c>=^-IJT@(Hep7(U(jE;^owYjia5tk;YStp4tqsWwei z;j%>NRrbfDFE6q= z&C5wvAZVAVkP;O-z^b-qR4+3Y^QoY7@*tyiktf@R;UQ(-DlyJW#vvPhu$Wwt6Z?E` z&^|vogjQWyG51OL1_mnirnUf&%nld<1Lr-SyUR}WQzb`C>S%H|ftcvcWPBi&cZ+Iw zuBJ;r9Kon15g>UaJ%sM;!QX^6814hS--AX33}2GF7#0n{#s14WtkJ5xfbm_er}AR| zpaz57C9PDBFJ7G2VbccSqTOo3UhFktuX=SD?s|(~_oA#WPTtgDSBNj2RJvL;p9t7t zeW*~pr42{R3*32CgS~G6E;`3G*xTabv#$5OvgIeC6f~5|Y>|WsSFze9wn=t;35DEj zo)0MU%S&_QqAHaKs z(fE*JJsy* 
z&i1DWn9r+{VJ`-oG^YU-Taf(owwRtsz~uA8_M=Z9Jzv*R4~8y?KQ(~LFT{|bO!n+5 z8Hox^EF{8G#66S6Ec>s43n@hzo=p*BYlHwF6jlWwgts95@&g1^fB?hWVw{|x!7Oe8 zVJ!_GVfN-SxTzdg7Mg_-&XA!>%As|gher^%?wy=`p7h9`(4q7Nxm&fYHP_C44b3*5 zlAxFbxAAhV2PYCd=V7ov72}^DLHi5CftT&x_Szzc;WY9NeC)!3`tc%n&z~SszZfzAKx}@eakCTJmMpWbMW>aC@YdF~0A~L4krH43BZFIz- zf*!UTsT?ncfsw%Uy2}3f$RC@>0)Tm^ceV|GhNnZaOAhksv+bwD#}2@u;5HL2Pw!&0n`)e{0W5I~aN z;x(9y_ZFmZ&TK{}NH$EF> z&F8)ZFTN-`lVZ&yh_{;$7AFa~G~9xR*GSs)kwtQh^&l!&9w?V@ZhsHn8`rOTXuF10 zZ7tW8aJy4e%nPoaOu-aE>)O_eeB=n?x!9+$mM5hIM?nWw6LNBs;uDv)b#=9Hjb~!5 zLhPgHidJH*uM-3AlwL#S5}PG*Q)ITLQf^WH03?4X7w95(J9wRUzMt7rn_U9RT&#ijWwh(R7uGDzY}Ce+x~n273*nV zjdQcX>iL70#Tb$0@z>vtQ3Cd?A!d-cL9XQSgM~3m_I$I@JI%#iZ|E(cz)ck#I$T$- z!F$|eQtai&^|6<#BeA&Qnupd=n6JL3ZUcFR%Kn^K*R1)@G)0GOPn|~H3!Ax^!{SO_ z$LjqQE0&YXlGJe4Ik1#r!QEOOx7j<0U`t##Hun(S?=961XTKdCnns>tlT9G_-r{DX z206MvyjWz7)YI0T)?y_Z@&&~ z8<2XOb5Rwe3VXBRy#$#6XHK6rh%q3HGRwlR#NDe|?Dcz55uWeu5kol~9GXUX0QQS( z-iwFGVScm2+J-j~x@Z>O29s;g;$f8}H=wC*XE!B5FLNdXH?szUOUVu1_#d~143`?{M=HVpYJ$JHPC`z~xCE;jHE z=WX);5*s^Q6pL~__}vi^HPjeeKGGnouJ6GZU3``e6@Uc5%;J>14i^icyKV`wL-asB z6cdQZ*PD>!Ij{8}={tM+RXo|u{2Hg8 zq~aYz_Z|VpJ4O~H_V{Q@@lLcnlk}Q>Ksgd_y^eNd_g~M`^%(=f1n)egGdDvA z3yF1II;sxddX`*4sAOsFM{f4ve&X7|zQkvY3ZhXe)bI{!sksD&Q+5$89PpWg(Xcvc z@Ar>;x`-S_loOO>^qVIjD1N4hs0uNcBjX~75^MORYnQbVdR(e)s;0IuUODoqb0nk?nQls0I&bEpTcecd}bbI#d3UcSv2 z0wj_Zuwm#T-H0sGPvldAF%W^o1Kjr#NI*B51$MEz(OHD1SmM#jV_gaF=$4`agoe|k z-?^;WYspT;b(YNZLI%6hx1qy){Tbp!9~L~@3#AeDpt`!+*~ogHrIn0%TG)+H03G5} zjpZm~uOWvZcO}deaFfZn^4pmUf`Hl-aMc2OI+JiS!bBnCry8p`MXzYSW@Rh1)+y>8&#p%?gQfMN z-uvB2}nTPYK^Yv6YRW3d|Y`n*^B-EfrQ)CSWjUk<^#hQtZYl*O!@oNqXv)O^+ZP0zSdJoeDCP-YNZA+0R@MC^MhUn%95gnN zE4mSJ5c}5er3?=2^fe$O>?(O}7CQsI4`yvv-MlMuo*1x1hesx2v9jz8gc+{8ytFuu z`Pq9TyrA*o*TflugA~(0guWfBNB`8wtrM3iAx1q;HD0!+7RaMZT5dxaZn49Z?xWJJ z2YqSx{zPnHC9a)&GJ>d0M-&6)0+IIU7e@aGwqq}4zl@Hu7!6B!H!eP6Gb>8K|CHgY zMwf8>h9`!nL~&slG;ewP{FvG|QljoxIY@k!Ce0jxqHL4p##!M#0Z0DmW;lG>P?(Rg z&jpEzvszEb z>{j#NVXfZWkI(<8lY8~(`NO+)mi9l0wr*;AGWWvs?C;ri 
z($IB)3Dd};Fb|R&;40;-;p;MCFe~g9g^MgS0I?%#5Qd5{5Mv-RiwXA+)gVG@l?e5w zJlFC6Y0en=SUSLa!YUpKLw1{Fs$G2R3U0ir-Tt^e%>dVWJ@}vZK>5A`9ztjaGG`&> zl*PUXhz6ewJI2+O6UFBa+xwyDu&9>D=rHQf3mkk4tT9Zv>y5raM2{|ij*GR@oJA;$ z+ML5^Ro*&k2LUq!(MGlU#b^w-bIh0T{EKpgTi93!^eW;S69r@6f#bok#!Qaa;fQ?; zKYSFt>K|@DZI-OB52Hk!YbN#4q42K4Wxk=8R~qdMQ5P)r^f4re_IWtvjSoYh={tQz z{{gIamVZrU_!=jH*nBM2V^?tg-zsl$L{!=T;jyfoq!1Z!V!;(@7Lr9&lQh)E@(YgM zg#&bX`^1)oi3(63PAiRRA3?ofPGIDVpv;GDBGu=hL}e-0TY}bNY%i>Xj(Kn3LrV)B z=n(QC=71lfK%&98a9~m$3Tu(AGLzUGsbSLsx0N#v28guslW$V%*+)v0q)dp@A-(#{ zuz@f-2pDmLT?P@X#1j)&2+d0mA?f-Ekf?|Kvp%Z_HLdIr=|RnrA%elD;Wd)>%=7X+ zSkJPJOLm}G;mPgk4xH^$c2F$FfR3WO2wcfzzNEqjirz*m?oXS=^ieE`hHPCC3StC( zx!H&c#GcH0k(c2gHxW!j7lq}9+hU>{?wN_6!>r*8O-dxbW3m2>08teifOgCAf%IK1 zrfT-dCZtIimNiPZ1fK}x3lMa;x#-uU!zdAd4?uQoi2G6feYS3L7GRjR;yt>m;|kLx_^9->l>4t_U)PBM41gVvm!APkATtKaan2eZKgurZ4n+;4HuM_S{hK#GSL$X9|36V@K{8jAg%;9vzWU_&bl2`1_aFUHG z_b&JxH3;V3ATqg@wxNPven0hREF!%i%dQ-@;S$c}ow<$sV>9|n0b=TuF!bq zT?J&{SxfeP>tAw;u~s#~*ImGV+Ozg~a9@qaEga2BIhmp68T=|-Gb7spu-koDh7|_Z zFS>r~a50LA@c5HsCw2p+W;oq-Fkb7SV5AFEpmUE~hF26RH=C zk;)gMlk2nt(Fme;hL`;8w4(zGAe#t#BYimgMeIVl?uPE{O*G7)Llg-RXc$r66_hRr z5g9{DyFSv{SRXoKcr_U74B-lR8HmZCX-DmIXTHw!Ir?V19|pvs3{ zS8fN3;yHU;Gt9?5P1C7Kb$n2b`^n{O3}XC{(W``@1xZBq3VeVz*Z0j|11plhR@u=k zT)Vs5IzPF1`nb1TU4qipNUwh3Zh&(MUyBs4f4OJX^nW0~GSjdh( zLQ-rK!)llLC61~;#Vy@*$=C{xHk3i6AHONknZ$^5gov)%g{-ze4I)Dn-#7QAAk-KXW0z~`{DBCzRiPUrUB~viw*K7K5O#wSRBf5=Kt}HX zi&cL;@P4%V1@o3x4{$%Qfd+F9JGX(D6kYoHcN3kgkN!+Yhs*dlIgr5+DPR@t!{X8l zzLBqw6rhVoJ4s&zr7Jhv6xHE9{LK(_G5d37u#<#yIL4>fJ;f8qEKX{cfkK`n45~Mv zzJeH0Oz*N^?Na4PNjB{EEaeRME<&}Y!6ptI(nMR;lB{3$rMc$w)W>Ngf$T*5?lQP{ zAui)}4bsw;r#Ug(KGH`dp&C*F1i~Z@A^lkLvt%uv9HJjp)F??@L4vK#;~!y*z|@Qo zfu1Rm@gX6`Gx+_SLWIq6Cd9B$IimSu_TZO<*8nveio_h`vGPwLjfFny2 z?OLOAc8VfF-(ow#CIa=K&}6O2{#*F|e)8`-46G=&O~F-e*dxWhFe6(O_gaBS)}f7v zq{y|j%`FKzUPJ}NSTf(gpCY0cLOcqQgQ(tE_&^;rHkjM_12jh;i;x4M0EgI4-|%0Q zd>Ey-qx4RceiWr2N9iX~`YF?ObODe%?U5AjLc-tKz!V8_&OEv@v?)_SlE7-Z*|J}q zDBLLk%G1*8&THWctX$|fDDY;uq*rk!lRCzc 
z+oU^8{DCu$BJlbge&FoGoO&t(X+^9 zr7J1$;R}ToVQk$^s_K*fd~d&(<39xmWCq+3ufg`tM#C7uEDPk1;eSbE7usotMI82= zD@)CL*#jEXs-5(uws@ax?91W$`OHrXI9z+I``gsc>cKn{&36 z#KG%1Qx@3~zZ~(ai5zpq{qpw&X+xJb;>9>LWK|`M5^EYd4F}I`D5_^zkf8>YanLV! zy6%;IL(#WTMa>N=4O{{w8$?7-F)nttP_NA(Zegr2=$uI%f*6(1h{z)i11kOvjg;8H z6v7^Y2o4ksGZ<1*2YI!(65MFwl_cWuMy}l~T^M|H2!a)R{^{fF&j811l9*@#ZNyCN z7<34`N6-mOiYdqxd;<7&YgRBu;^-QD;w?<_gTI4Q5?;ubHKRU(K}HbYpy}}Hx-!~B z$aVG@t#&~^J?(TY#C59AvP(N1u2-Pt(wgBR8%9|Jy8)7%V_+>2b1{Jvv06g?QZeFj zeI7*Euomn_6HP{KoKZ;_ti(OwW8EuCJeU}8HZ9K<{r9wK@RgEzP`wk@*H~V}I?A>; zZ7@|2Fle8~(eciH58gr{pF>9b7=Jpg9Hw&buSPJ~lADm^?^BqcOv*8Aa406iKg15} z@jooKb0Dr)pFZ7r9B5EUf#TcYb!AU-TvRxkZtpzP#O-!|)MoCe&CAGmw-Q7W<+~Cj z6*&=@Gm7B>-Si}}?Nhkq2{184fWNmH62rbAKW{(5|EKx+=q-#)U~1ZHpZ|WhpZ5+9 z_VeBegrdc`g7q=YN$?0zBrr!V#t@&5=7Mo~P4ru6wDLYi1A8PGKIDB|6###9TCTl^ zr6qn}pLE;u=Vh<;2o%3{`UZ`5UkoNah)o6S`4A-DJQCPNW<7iUbw5vZ6O&#Am%WQG zdkg0kJcc9Az@cc`e0b|f`tl)fYt?eeK~I=R;mBn1J;e#0t}i3O+mWlK(;kE%F^x%p zd@l;#LYZwKM`0W#rkVkX>S5^i8AQP9Br2y#%u_nHZi0aAW;9ejc!J#rQi<*g`1H=$ z3YWia%E_+s3P)fuTH+vh6IH%2(Qz4s)#&3^%+MvX@Wqx0Fo=)KGYNDcro z`T9H~`7Vg|^2zV)4S+cOTE_wl5qla%E^-eF;*X!Shzmk}tPNW1b9IJLMhG;cU`JfI z4d(Fa-T+p!8=8ia541LyOtB+$o#*IP0Bs-2zflwcV4b^u?7m?0^I_w(aiZ#FVXjcJ z;gO+a^D}jd6BEb1!KUvU!)*Y&SPp{!++iUZ;B;9s%z%~xMwYGn$5Phllc(+$Aeh*R z!=?OHu@t6zEbj%Wca+uuO$1*6bH#E@6Rb1zSvK|w2yW4cv5 zSqzd22mtzoLwe^1jO0jfkpPg{RIe13ikg=#js~O}MbsdeJi+ZiCn5;)I45a1!Le!F zB@`x(yQEY&$>91ySgYKs&qVDN1Zg#39L4|J-M7Wam91xOpIm^16Ox=Hhn!r1L^(-F zr8DlfJ$3-m-PP@J+wOKxS9|OXLPNQ#+FkDIsyel+ZTAsGA_+G|LP8>hhe+W8BoGoW zNC*iYfFy4q9uN|d5JCch1mckwgaqIBU)K89u3h%z@Bl(H?%L~Lm%aA7{Oe!;)x*TE zer=<;um+Bys_)!b50_%sdf{kNHi(gz3C9AX0u+Ld(-_Z2*#-w#4|-F|J88Vf$AMb0 z_C;$-C|Fxu&u;^&5)Z+8U{(JaS`CK?-UzS2nVAb1(!gA@dfa&$K<4v-`5-s$mNgz7 zJa;1oM>kMlaN33Clj?(GgEi%hi8c!^+D3T$yQ(9*S*!GP9M8l8cT?H7lFikMbEUg$v(hJs+KuLJ@s zU2>-4o$|GFjZY2@JX2USN-Z2 zc)7WC^VV&+?t_F|tkvYgLm2Gh1eyT(bD;w4VN=PE4@oXIp(1%N_A-fTf7;@BJt|KY z8(Jj_mo;KV52~+%s6bRH)fE-AfB5ky@YJ=yuwbI0!Y{;X;dnqT#9a(Sj=^B#1=WeG 
zKSV)@oWR(b!bwN~>wGJZeu-Ok^~1;4yj`Et_k>)px0&=qv#x$NueOcP!~B?h{lgDy zC0DCI;{5-|pMLmZD+i=g=Ft8?wFBVmG`twbAfqq-oVR8a$BP;#{s7Up;DREnFqmR2 z0*1=wf=MtmlW9Z8y9I~E{M_*AlokqvvkhQXteH>7qc;Mg-i1j{-Ry|l7^=ol998{s z&1NG`9k6{)fQgMgDYjB(4GX#kGDN;mj!sF<;iv-MHN+bm7Fr9;C3Iah4juL-KOGkH zNRj4}2YV-n86r8cp0sC*lc8&kQYuq7GA%{5BF!K}fpfK{TYXY*pl0eEVpKr5*L-vZ z^+r!PiD8|C4mDmp8B^ewv_Oz zIuJqgEsYE{dy-=I!|SA3)Oje5{-TKvEI$2v!9bMw1T{DooE`jdwt>5ZA84ZkLTSu+ z-!cNnG%4)^7xTxAl0;38CDGA)BC8ymBuNv$4sJX zR!ZkKSi_HnYZ@J?M;w&*XPnWX*m2+~EGFS;5v7Jz zIcqb7i&$^SNBspHu}kYx*ipwSWL-&awbZg*yBq0dRV`%|9r>{le-2~lApcI=qbSYq z;&S36_+|XzclmGXia<4$+bw|b7r1f3*6odjL!Ws=f5;a+m@5O1Imh7m z`UqhZQPmV>#QIR)Kv{4?lE>G2pL9rrTYMS*}X{4tmI#b=5(P;DUBotl0! z6A7$z7JcxEq7GD)^HiR|S_CxiFbo%D-fF0b8uplvH29LT^@!g2qb*HO+MMW0!r$Cu z&{I&MNfnYSl9Wl?e-%vy% zW%`0dWOjI86~PXx876~XcT9WFgf$Rly(-0BeawFmuUFbuQH<{sfGz9x%I9e=xD6K1C z45LSy%4{DxmgFWkDoLpkoevqpv}BQz6+4HjGxZl!!Uy3gEyxI!6H&}Tq^x#kXcmUj z4Fd^oi~z$)1qj$T&5_k37LhWaiML{qo}Ycz;mm;k961)RJ7c&`GuLH1#|PUxlS%rR zooT|QkkU>Q+|G4RQ*0}wp#wn4A*Ly@z7Mf{gG{>)b1JE5CnY(?vNZWXs8Do*No3m! 
zqUE8+&65(`%1j`H12gt9#Mfd-saGSo7-0*P_8!9Bhhvw+@I^-;PE!V%J1HSN`<%)| zBbOLg7S0IqWVB2h$h)8z1}Q907`vpTK7=jm&GYy@o$*V&30wZ<8cTbCWPm6|$qnCI zfoz9lgN{OvE?F{0Rotd?zyn*;Frun`_W&yL3ti$Q$|sDel0ko5Fyj4M^TOe%acmEcob2`L~Ps^Y0xR7x~D22-o&uq_uT z%&&7Iaf$>8T0Io;R8eN1fl|d2GF%3X7V#_4tmkHMF~Frs;&Alh>LpyQP5RS|JwD6* zfg5o|RhtZYfS7eT)lvX4>Tzc)>Zq-YY;CQp*C~CD)_c;hs=f5*q#Y6LrCdiPf|qLD;g!RGNTS^vqT{!ibdR`a~;*w~Es*#*$DeEAW zqz?tM42ESxB98`o!x;cP$UA|blyb-sJ@f}*b)+_ z8{|J8FUMfMT4Cp)#yQyr?0*bn|B@)!dBt6PTLl}eMrQLv}PGA#(vT^ z5$5!Gpe;3_;meu@By>;FG5Smx4adM5>%czw!dzYogQzH~pN30M4^;WZbxYop1u$fD z39R&zJoV;`_*B@h%QJbi(K*BZdI33PN-Gl&>5~z6H(n~;#|ksG^|cw0qxxF43F={i z!fvEtE_xtB}Znb&uh>C`;8Ujx_h)_11U4HYg@ARMq0!lMD z++nJ-(<7ggS6@8C=}USz>ZM0<1Osvm`r|vNJJ6^gQjlC|Mv;L$T95=Y2##D)KE>=O z=Kx(W9NOP6Inj>*ih%%v(`Zx7lU4zgd!Q%yn>`1$L%H+Xb2Qd!$|I}Q6Wr}Urk*^6 zz9ASInOH*QLwi=BO0BsH_V_8r@ zAnk#W$WG&$^oC_g<%BY;iW1nnSnX}jD!dxiinNM-9L{lZdsqOAMEd;p&yJsK&-R(& z^YpR&ERme?o3O3qg?!Erb^|d$_}Js#CkJ6QFxfu2k`wn80c96KAC6#Gh!+zttrwWanulWXq&y!*GOL8xuCWmt4plv-i zf1S9@r)OMHp@}&KQ9}E#^jKEj{s?}gQXb;^;1nDWGNdex;ite*dJ!F37gxOjK5c{J zP^sdwe?3KQt7DN>9Yz?4kRrnd>hY|dW>r$qhwxt?(DVv_@C=sa$m+#gh8SsT+28^y z?d$Q(GSHN{n9kmS0Aa>9n^Fq?tAT`&DHiD0@k!Wo*YnH2Fk=``pySMI(JmdZ*6)$ zhR44bk0HbW66x!Zvvt=y#)LJF$NZ_9faqAJ0jV_Xy~9-DoSYYU-ttlxBglTO2A}t* za12SAIm{Xi{_K3bn9Md=4R>Bu|L$IZQ#lV(3VIT+K{(f?Nm)Se=j% zq@tm1D$}aSsL!SWS4_mflW!a)l$cTVb1jo{Rbm#LL~R2D_i)r~QK7$SS5&@&vzZTQRe8pqBsO zf~-LJFYF@d?R5D0;{^4SrtStfsMYsn4e!JsqYinDJM?)zbqGd2?snB<5GZ_T|J;?# z;)5+0%ipy=erR>T5vr3HficeJ8^7u+9tO5xlhMn)@IIG7*0Tw?^2Fpc?v$&)U(Afn z3DYtSVg=JkFe#NzR+!NmD;Y}t7+C8c!IoG(hpm(*T?E0yU<6(}_8`l{_gSNu*LL!4 zO?CsU9dY?Ej(Ik&K>x}FaJ`HKu(qTMg!KzB^N8D|gKIyh&XUL zi#NUq>elTFPZw7;7LzE9JE;y;0D4-7PrlVWu-(r@jeZztOnd}yn-BmrT2gpRJKH?B z5#|$Y6L>i*+RUn+lu11tUaZSj7LBm$yvM}yvrEyB%$;$095o|jS6tW<@n^XYU*LHC z!Y4ZtMhRUtwg4@};2~O~e#>(`Qa72vrNj`x&{U@fx`kE={(kKWOz{aXxV%=7lR%>M z7+Q%4K%Z`jG4I{LkFQ2oMN_@m8IZYMqGLPHM*WMB;X*(cn$?|$P}W3`55cX`RfH`j zjCksO@vhG+PqF2Z8cOgO!u1JgV-AOi{B&$RQe#7v^MS@uMJtGJ=5IJ)C_u3@YS1S> 
zRe=2ghNI`uiQ>r6hE&`@KV6>P?Eq`SX}bNS8w^eCahFY;?IZ^#6AA>i&yJ)_S(wM) z!eYyqB32DX&$H1BFm%=BL4Oiw3v17-5`Vq{S2|`}9uILalEL%@P9+-V$VlgUU>i;1 zVkz+DcQ)Nl-S{8Qwikj;Rbc2y%Ct6BCv6410ssP`nkw#zL(#|)5_vRC<;-Bgk33th zI`3(<NUHYl2^G@*vAvpCsqLLMBL{!v@Aul2cLyQ%$+b$xGzq? z{j|)6CGPRvjVDAIVVDd^q%rTST8a3(&BP)%<%hl&a|bx$Cn|M{T0#60>v*EFRR1$2 z_E>W3(Ub{n5R&82P9GH~gE0cG0 zT{kJ&vbu9#TIRXBOlm4o?4bnZx95bjC-SK z(WDNgoB||A`~e1rT(d$F_!@^d5D?7jAtZI90pj;5s2HP_1XZKvd*d^woRB&MuMzZ6 ze@_}Lk@em{KaiI?pPTThjmgnutf_6uU(jRJDamV|$@Gep z;HWc^I+zk@S8{Ms1&Iu`05Fm+#RiPSl0Gv6Iai*M+$rhyNrw$}9^vFaluYMNG1XoH zvnn5vy55?>&j=Zc`Vzo5Q~`)p5_ZGeQafp?yc$Mir58E}xVEfLMyjSW=_{-Rx3#IW z2=zwLt>Apx(?Og* zcBGZp2NKJmU<5s4fI<@8_hi`4xl9J8l5(8ox1(t?g9Bh%UtRi{?5hbc0ey^EGIWED zi*EWd9V@bn@af!s9@C<+SJswbzjCGwR1MHRoxhg^hJaH0aU!&I%0iQAz+t}$&Fk?p zeIIm9fgYh$!P7YvE1ng!A!benYvC-(3p5Z5B{` zn_q{IL|6bw1P~5d|r+%Nd47apI>3 z6Py5J@6ursX_m5Yh$9C)aynCvR~o#61+>-+jX6cU><{YdpUIt3g9O4KK?kv>nvnau zO^6i4AF5#=d?I$(l#CNBQgV2+;`2xH=SlBWe4aA#fPW5pPwstq??XtvwwQb*#hy%t zl1Zfo32>WTOF^>Ba`YaB-);Qs7G0`-y2koq1XbE96&tB(+V@Qy{e72!G=5tml6EP#C-l8 z%l0_2n;(2o{o1iUkhB3Gur9pUK@KCOa%Lghvs^40GIgMuuccMIc_3;63-HVbJfTns zw!*$GV&ZgkImZc%ST!pJ*Gt#u7vs0p@1?%I^Y9ABPbo>5%w9r{|FZfr0zGP+XFL#M zEVlI;<2Glbw|xo>>gvmIv9_;?&ckEq6j)-t0uKVAyJFDC(Dz9i<3P@sXb?oMqKU}k zW)Gd8L<=h?of(%P)lUXCT&Yw^3C9L}DdSWXt+nb;xYiJKiFUyIf#N$fVSV(_of^|P z2JL}^9&AG>>gxjq4$wZlNKZ%AzuMb5+7F_i`y;URvB3yd z(kMN-I4J{~Wv%i`!8BB!wlPWtzKU4ZKf|;pkq`;79pkmPK zD@0EN&KP7e^+rNVee8+fP}X(Fyl)mB#_tcN%Sq?phn(|aMM(-id@7>vy&1tT{nGS6~ zzb9}M+wT_Om^3dkQbs)9JijMf)RZ7mS&|R~Kfgyx6C%)rGyqaUQTMM%n~UwxvXc-} zsrnKW2AVp<>#^S@wjO+$Y%I~so<{rMNe$y8O z(1ysOw~5gmSnuufjs?M~-9TEP$9bcaCbm~a#HM+LmFrd<;E(5fqb`Ot9v5U?w`D6Md2NQkKff>?Y>|ji0P=Up)J4khWJ;nDR zi>@eDy%~eMI9-gP3k$8y0?H@a- zv)8|dkp~?2GUgl+(sT(9l&aNw6hC-mg>Pl4etPM=JY)g#&n83%_)8U84y>j=yLrbR5Fw8FzDD_+Mg)6)Z>;l; zO^*l}GW9Yw!U4|(S4Q8bC(k3h7Ubzn!o-lSFmf5b04O!e;(Z+_$-7-^JBU8-%M)8( zH#4O1##`YeGyHAu)*aWn1zv!I98nyBVQp8JF5yu7WhcQUvva^pE*3g>PmvJKfrB}eT+wVO!-66J 
z@|qk>+{8RtxKF!67jOJox@CCjk<)No(9u;qWDQhY;%&uuK7x6tT<37(5c@yUoK)X{ zjjPpDNvDD&7aRkWQLsLXGmD4dT&6k?`}ky09efCsB)UgCK-jb}RF_@@PjU+JiL)sI z;HVI#y%h&bs;@+kMSLB+tAHSanQwlIZo$+WN$j6Oj9?BL_vAbPJ&wEsuf&f@M}Oo? z^5ZdmhiwZA19NSV+~=9%nP~zQagB<#=d`qjXo4{X?mZyt_Nf3kTYw*3<63cl^RoA~ zh1`uU1s4TzjGUH7$8k(e|?F)$sM8zH#2|9`D(8+!0d%3FXjI zBGr)sd!KZBTqa9855gSg1D82v%5ESX=I}hliYp7aE-VW)Iu=0by;ej>9nGf-El5Lq z5l{sL)yqcGKULL3S|ANeOC7HS!5+D`x%Ds_HsI zNW|nl1@W{k6I@jf0-YgNACCHF3yM37kMoGlYT~Cww_}O*&Qq85{KhBdMTo~b$c)Q* zzVGMJ;RNcP4C{j>sdo-cK(?m@TcZT$6nzKQqOBs$%*zar-D>++Y!AePN#+_*U5I9H zNJNt!BzY3COj`$2?|t_)0byPHhvFpHof66_q@}RTnov$MNRnJvjC--r2`|vj_~zXy ztQv{Us{_O8^r3H$y^sA#^q+#hmvdI~%u7g-^=GOFi?ZU8G6+$qb)E{5$ys9k+mz%Mlpkl9&v2z$iA z1v)++n65O}D3o(9#W(MMy~H?xc|iw1%1=x@VJ>Krb%$4r%c})bTgu)zgBx3=05{J2 zZ(x` zqp_AOlJiy)hH;bSc~?RU<9eQ}$7gRUWU~qJhZSIcp~;ODyq$(>DP$L;8$XkJ$7E8c z;ROxT=o|=U$+I6oF+m2T=GwJRGO^zvVZ^kXTBbIY)5RN#1S!$iaOBj-l;I9237!q& z9H^0|uQn;57kzc2-+2cA@Zf_Q+eto*COaT*<$e>Mc^8%poIgo1TRC(j{^z41Vph+w z8sV|vr^iGPKNi+&fgpryRhtB&`eibyXjLwb9s)y3S*Xrz7dwjg7xQ6fqifsRKJ0-? zp3G(!VD25znWr<~W1q`=059Bw4NyRs?CLgYU@_^H(>R75;b72tq!_^1Mb=6dLabs$ zg!4UJF01c3Iz4?i@3*MOi^0YcL_zhBWQB`eJC)u*cS4t(9?W@MqeHgfd3uINoDjfu zxH|*NKk9`hAB%)(YuWxw3Mx?OqFjcnVK#@#*cA-^a8N4CO7itRPp1QtDO_6dmQww| z8j;5t6dNThCb z5d{Due$e_|*cZQ}?}F=;#k#3T_TX$SW_k83hXf22VA_ zm$;OHOx~c!AP_+as@GqUlqg@*?8c&d*D27NUF5*A4* z7$F8scVf zmDr0Vl(Z=hqb5*=V~-AO*u6#QR+JEbogB)*uPJvjrR3qp~i2ChmR zF<2K~5$r*^K=|vV4WJ8Q&t<)qZ4MaM5*#k#wNfhIIWkP|RzM=(8}_*^xi^=SFYG)! ze7xh5RX$4`@d%g>;0+=GcH{H-WlY6l5aA3%-iouE4qmo5izUpooEZVUe;E>j7;A*Y zRF_CB`cZ6yOFiVd-yD`ZxQtzuJrhcjl^T*qkTZtllc^xt2$?TmE@oFnAu+TJWsc)B zf=ELcJ&C;of`1)~fHK6sKMxS4-wMF;bw3WWSSH2-a`OoQ5g)~L?0~cRVm|g?e8Fi< zJPie(T}~{uUKa@(OjBv)XvnV7l0 z8NJx(*Ox?|M4~tT@($E72FfkL&Q|8pXyHPQo#}L@7G)spgUB+cK(Y34PuzbEtf9tM zoV=8ucAB?H%Qj>1Ip|^U}V2}&9*`7@3lWZDH60kGh~l!4?jE7N4? 
zHm=9`A7ML%W`=7#*jkHp5qA=3Uc7mL?vd+76&dPX;vm2+2hOWcBsg=iqKmH-$`)4q z)qQBlB&VzhSdi zev6+33^+wU)t@7uYjHV#%X^M977nk$z>0t;U=zZvQz(Ly5r}mJmkSXJ;p)NATR|HX zbExY=2R@w-A+HdH@ldg#oQrWaY1hT*vaWt|zk9GxQWOLL492KShHcS7Ai^fB+E_Pw zO+tAn1ZMS{yIrGLl;SkES1;TdR%g##U3J{46-Y!npX$KZ6#m`9KLm_*=TZK2=L36U zT-xL+ssm?EH~9zdq2vddNNvfVy8?n6C`6eBgif5)eN!hnZZfJk;c5^oat(d2bfw<9 z_d%Oy%kOPnBR9HxyF2r-2(rRlK@B9W6yAH89=V#JC>T;z3VSe=>pa}s?ZQV^t|^eM z5&{>!u`Ly@ZEnU|_*8^Q=N@9<67be^_abq%5Qg=&$YSA}Gy#YV#V;e&^A@Dx0;KHC}>K_|;FlGQ| zUKo0(E>g&kPR9$xx`}6^EGik~$52pTm=|7JJ^NGKlqS^bNni3{xtziWIW#=Mixf0RBeND1-XmWk0JVt@)`YM&!n6RRx+ovp5k??B82G51{OMH(v;l8+tkw>DPK)y&MEv&!6f7u2|X;VC|4$_=svT$`ZxS&*` z4(Pz>bIx<$AQ>Vh)+^Z4zJOXhq~5C^hGjq^Z~aQ#F(Am5Xf9CD39g+mfP4)E&&$hd zw|BUCjJ5d5O(5~tSm~Zr&6qOj7p4RiX-{F3QoOE7Mg;$L2)a`JlYBNOij@4V-7bg~ zqX@f;CCHWnl+%ejm$*`Jl0jBj8XPtUv!No5`hiP_R6#zo#&NG#)IfKHH}U-5*3Fnv z5V|dEWca%J4{#f|_ld8(^psaL&+R3&U{)cvHfX(}PFlPuS_99(&zNQ|s$GTF=&<~< zpI)r0S2*h-U4`y@UVm7_;&k$PPnw7BJH~+bB~mu;4rX5649H>K*6t_%-(WtTRiAY6 zw?MZ6Q0(nouH4*!qtCV2Mr^b4$_Y@c#WwzL4Sk#`Y}sbJ-JU*TC*-RLR$Tz6#THip zEd8Vj)GL=&0dFp_>O1@bsaMe(|H~zcoML7*QaF)=9s~swa3TTlUHPG8ON}nIpc%LZ#egUswgiJ}|>r6(KKXmudjOBqX6A z4S9L7&glW0?y26ef{ zRMwG|WxH+%4bpj7Z@h$e((3Rd{LP11+%;uRv2mD{#t`~MNU{rc^;4f?7tevHp`tyh z7Cb9OZ4oMQ)L{My5`PjR1NsONGYKH36g<@ZsUh0Hy3%WNyl$aSP-5Fx{kQY#-@89f zBSj7AiioI@C9sD(&yO+1ia|O(q(tXCR!cxua0|YSGo$KJhs`CFE&FH+LRZ2oT=qjo zBo9wVBUtKq=P;xgNa@?tDMlLJe3W`Hwm>lerDDSQ@^Vsr{{#HpuW5s~sJ9CV{EvjW zEqRPJ&9}&JsXjaRGON;(3-LmGV)1lL=M zR43Y~&~InxyaY?-xtJg_1Xsyi;TXVXD8pqNrcz8f1xi)}E^YKq)H+f-lzOccScT~)S*@W}AP#|^6gBfsW-rE5zOvbtp5(D-)rSGi7S!iE zZbC0Hiq&6Ry$U7XJ#Qqd*|px3jM4f*LA54eqFGDgwC&E5a68)FW&=W(yxV&+B;d2E z*Tso*2nCzbf{Y$ve9D+o+yl$8ohK`6x$}6h=YcmglMq~FDZmkS!|x9RSzzkwGalIx z^Wz)nTv49wyizu##mGiJ2&~Sd@nmuW#7#42T%OTBJYdk=7u{W4&BHb7S@+p^2(87a zL|-1VSOe8tuFf6sX<)b{OV~__L3)!5rEC<~)#deqbtG>CWKWEfxniKBj(A#fZei=g zwbRnh?GChD+mG<5kXEcMWB!vgyR3irFHt~=o#PT7*wM|V(P()mIgyuTU6M9i;B3W{fi{4ESpm6YNP^S5GLkITmTAh 
z8~NVOnjW4$y>;`}?a=amcke;VXDhh*;ef8%>L<*V7w~>}-PB4>@p^8pVN0_aI~K*0 z_Ev)||` zuA7i%q4ZWL0L1#OYcLlUuT6U@_!sp7bK_K&a9MzVSQ+r$XhRkn zWKV=qE*;XRyc>%|X9BDGl{NTWyl$+UM9poTBFk#oD9?dE);72f>#$NA(s|yB-38hh zje>|OSv7B`V=(@&B;K9SV_bumkbr`z?F_e%g+u~pBuhgr0*jQGE^(h9C0h}4P)_&! zmL2&i7FMq%D}boR1fdFFz)Jw-j@dGqP`zz;p2N_^dF}=0qa?;ag8hN+VGlPl5Ovnw z4gnLmL&r|Il!U7`iONx4y6FXL3!$TIENh_RyyKm*4QTDIse*M0Lb$b}ip$nlFLHQ& zT;S4~J5oa4Q8;&@tkUv+j3mUW?5&L{uL-%nihXhylJct$P5~&olyVwS_%MRNEsZXW zoHAg2(cGn?LdzGW{IW4$V*FI_#AsqjjL}F^BrMJX8gSmm30l=XhI+cQ^U_k}HY?tq zirApwOiP8qSfMBxjk8o9n4DT^N(<#7$wgbnLo-*~u!wAju$-nIUa>{AnF7Q+Rhl$O zXd!kSH%#bX!^KWk#D?025D~czV_pK4}EaJ_zpv2q|wfqKow%fV&!FCk+a4<3EgJ}aRSk{sat733FM{)w!fP}WI$R0O}69yq0P*D29Z^K@~jY34a#&k3c zNx*0v;y^bLN$F%Hk|1tX;DBH(;-D3Sk(A*E9~q!A2|aB}H!0}`Bo1g8kvKRRk~kn3 zlQ_r?N>Vx*l_Us7xQ!k_-tNXFUMv`xIH)u-Nzl^JB!F=UHQwU}Ctirk)Y0f9!L#9s zgCGk8N$SQY4(*NT>a_6|SiWZ?lvEPD6*mY;uxO0pKw0RG7Ej$MB^5INGtF|rI3*Qr zE+sco@dDXUg^=~TN$rU1#kl(3qaOTvid{***VOM&Z=*0hsJGqtZ@dGE7Lf~HpZClQ z2~GA%kHahvrsFP!O65CP^)B4@8y7&hW@FqOpO z>JBF?BRKGYU=q;*Nhek?=G7GSQW5YRj%%SdnLx+aMn<*3TqbW>h~qeZNwb?q+7?+A zO=TcEfrrg=9S4|24Pk%}pE&h7wkb}58-J2tfexZmhZ*cjuC1A%zNs%u5fb&@b}8zE z+jOf9hM$upMvQyAt2wQ;T8cXe9rm6DkoPBqkM!K|%8=Xo+BP7~dFWq51!KH93W4Lk zGBQ{R7HGC#!kfwTWhz;9TwQH$gVOMm4u4d4Zo%@l#o#Fl5F3mjsNPNBW2O3AM%j#d z16;ThGsv{*JI@ca+|8SG>}R4r2Rc^q^va(1jJ3_?M&G04_!I9L1h^k=8-OndNv1Ai z7}!Ifk?A4Q9kOaJ>Np!3pZ25~2b9I&DdN)7}*W%ur+1iwV~E1dpH{M9%Bm zSICicel@+IFuDl~Y3R4laXJJ@dO;p3Hns}3I)0pZPkQKPD%|P4p+aWrGZea{Ex0q% z2dkCeL>nrq0_QV*kPGr*cyV|;N0dXQ8^&o=2LPMORD!A{akNb zQ{|8uY-O1#JoHA|hFL~2&swT&Dq>D-rLk^+3%L6o+SZa1DJuiq0Iv+MPv+_l@|^Nh z9I9QQC%F-P>x2Kc0NLy#_;k7$0WM6*4C6wyl+0QDipF};EDlH`L!FzZ9b?IruQGVq z*`kl{;?57MAMPG@)3wp)KR7`EsU}8P1>I4y_SL2607lv4C5@Qs>PM8T08vDJj*bFO zIfKG=I)NocdL}$CAR8M<=Gs{vXJytm?1?<{SixHDFN!i67b;+wHtaoWbMS< z))Zxy2pzzL4J3@zhxAbwV3f_UL_BIVExz(4kLr=~VALn`n^zvqA1vKPZ8IH)qBz3X z7W42;W&a_HBcTMN85Z$)xJ4qgNe^pUZW*Csd{_c}@J{<6kB8xlA|O;96|1@vbmB$jK}-BpP`kxvp4#MKPXeD~(lnE?V&dxd{%Rl=jg>B8~?$ 
zV>;Ia6uK?#V*@b#%!_I-%j7-4glz#lre$vnT|jARGxgTia(K2b0Ah`Z9rvRvU{=XOZo#=?se3d+KlU+lB-AeUKL{U{XXjCDgYceR)*6v86M^&a=C+_X;) z+wkW+djXhR;yPSus-AS`jH8Kt8uF?{WX@v;?o+s|WSX zT_2KpdIG`X6mE$?%;_D_JaA&P*ctud>0UTUEY6_8WsO)FH_mnk;Hb;Z8j6kTvS zXDr632&A$9C~@GEfC51(qHwB_EG0S;i=z|-vHYkhPby>^y91ZPw7%RkV~18?&2FfI ziS(qk(U70oZD}KmOk5&jgSCxu4)SGhFfWQ zEhj&00|t7}cY56eP(}SAFwJ8S6VscPIP6a-WYW8hub6x^KBX^bsf7)C^G<%gtB>nr zXmRzZqVzXEYx&&H!$DtJxU&!;hd_#8DACH#apHa56}gRZ)~aL$=BY?Ht!SVu)6$Il zn{4YUqQ{vQNM^-in*ec_u*lPQJ(~M@plrEPk z3!l=fh^+N@yebz;Q<$2RzT^V$g|BwO1kCSvzm-_?@}_`Vdej8UN}_^!DG_h$&HF~f z#%pbF8aRxjyn@8Iu3NxyiL->`s`X>zCE9!Axy7uk0Lwx6bTWRy3mT3%`W0ak@qodm z%`D4|!HkiLZHV6PzlFZ~>Ka(~zxZ;7za?MoB zE6k0(LZjH@QU6S`8t_*ALbltc5J>pRY=&4a69(6GobhFk^O^&oA8 zpQVZ^C5!_p`bIR@(GmPcK7c~^>X(B)l+&BJ2k^*v@SG(OoalTT`?h-ms@NRdbdL8$ zb3lADGlPzD?8+(!;mPBe%6eA;nm$En%byOENwGl(sJsCo&v}qd*45t;{Rmkk64tUd z!4@_KTyj2+sr+*IEJMihQBn3?xqi-KP{f;hXcgNBbH?F0sOu~7%W3XEwsR?+pBMqN z>9!uu7l}0MY7f$mg*ajo<1^sLBuo$caPI=@M9C3{OJZOkpTol6SCnep@{< zRGfq&7(WWZG|amw#B%&n=;Q=}AW`IZz8dsjA$;i-Z#MrR z5oj4NN_E%(i<8CCu-L_V^tg;#y&x2@mV~xvk}AdS3gUFwA!jY-ciHQ4$ojX%mep_e z+1WV9E*v;nS%1@)p?nW&UD+9;HJUyS<7vdFmT|Sxbp_P$;hk;VpUnEpcb4D*3;xAB z%h1LA*5-SQ+`0A6A~0ZEtnUls-;S_-NjupYn#Th|(}iDd*1aQ^X=y@@x_y9|!r7JI z@a|6ptpJ9`=Pk2-9CADP%@(9#{rFme^{pxgMMcLgU+&p*;a{7t+7Y>1s0G)L>jqnM zck`rWZz0VlLLqm~k&0|<-?-xnov|g{28A%nKcE-1r5|RVq%XhqL?qDv)od)^#fe^+$vP!!by2vd!kxIsT4j#5>db^794P+&N`HdZ*C8P>h6cDB*!Y zqe5LuHl=91o7V=ao!}C2&3lcsV#ibhua=3!0x~^Z_U5rHXl1U`bS%qLJg0mLj zS4*hdg$tFk91{l(;auL>dhj_$lt%F~zWS2(x%wQF)VsdC6peslI3m=><-Pt$udvB( zeA+hM2rdT~?dDD1X-+kW!duh5&vU>C#i)y3FCLS7TU#Dp>LHAGAWs?%(;GVm!ag$1 zUEED)mlE^3*9!zW(LsVbSuViiT^Q|cV$^AVA_;$nX&(+_vGGNpy%QP2t8#*Rn-1v7 zaFul`>Clve8dX0XEoP>Y0V=n=5;7*C$MV|1_YbPuO4(zo={1A#2S2_V zT_J+$R0%TK%KdXN_Vv)B*H9=x{9wD|Ayb6=jc?rjkVIPs!|W2uir9`g-lq2_lmQ+? 
znHh>W_4w`mKD^>1+!f|k;>+LfUyk96{N(WgtcdaF;Iz{_zgiAwZ>GvFnZT_qa5n#p zmXG1^RRJRj#EJWE@4Dt%>zsCv;rMvGlou9?(#1M0j5TPh@ppw60b2?X7Bi~UjzxXX zq1k|=vr?w8?_jpzd^MxEd9hMlGj1^>w5rfM?clp*;cv)k=MYXi|6)2^(|$|GH5pT@ z+2*OH$suFfPsI)rM&aT~zC9nQYgk}~qH-}YWNLlkZ{|ZJvvy3Suq{j2Hz2T4n2fHI z(d+T(4T47`P8T>|j9)Y)JIokDX#^mi3%LP&H0xar&H=z7lbcegAgv5!yX+Wy;OAas zy?Kwy@MmR;b){JsKCNa8zi2sfhTUpu9f9$pJ*ORNjgm{L4bR{tP+T`k`7j`43JOfh zty8qbzQI8XpSfFkTWt7Hur)d#buP>P}#J$V0hvv^O4TsN^HM z#8en#bWrh9Q{;l_6J!?f&aBU;P-^r?X-MKOYu?l8#T2NKiRyMZ`iiaeN_k`HEDj!g z!TJ96wVORgbRajpo45K$hf3#j_Q3uReC!Tfb!c8PH2nq{;`?2WZe2tJTO%h3fvH~g z7u6p{9P7d@ZB9pBiwuogAq7K0-By8S6eYLn*XTr$lyRU)Kwl$p63*DOSr^eW!V6AJ z?mm0jIhsu+_6QbJC?FPwqEm9bfUj@(#Gtw#qGZ^K_W+bPoRN~c!IkJSL@D4sQLCRo zc(4-lH|oOB;Z(hzqcOk&-xyrZtN#lBP!H8kX1_P-SxDoo%wwP^$;fg z#;Z9a6aM_es+dn(RRjcbMUu5nL$n^%lN1%<*NdOF~EGTKBdH7xp3>QM*G zZs04L>1gHj*3R?pWQYUm>P6@I8;GtiCcIzHzE^wL#8}nZS9uqqWbga)I;9dEVQ-M z0);MaU=XzNasmHt)o+=7kgm~~$g>xr_pc;8&nhTMVl2x>-3^T_$p{7FEjl)UYyBnM zEJEW9i&3cNmO}R8PiBZls9zvmR&ZeoS-1?`NI3YbKs@vv2-HP;MNbK?4HHxSo>vI6 zA{W;izr8c1OF|S!cm@wAO$-WD4#$1mKzrjpB+uG&AXMC+4o{u{I#_N#L_1PgZ&yZs z$~p!mDLd~Bes2)+cLt)LB~9>iFvW5m>lzNvJJ_u$V>c$cF|pSXYWC*ctu1@H_u-~K zZEoGWNs>sgl5|aqZr-|mTT&07fWIL)NGJ+^=#fN^wZ7xB_4axX8jaM?Frz>sWcc$% zSzOPqan2k}(}zREVw50njDrg{e>R!D5i?GCBepI-pU-ac1rLoYW3;_VHYLV zqhh?wC|6n#szW@%5J$xhCIvHH!+VWpG4i84d&IgT;TV|bCc>p)yPShaB94TO7RX9q zW^=@4eBsd$Am)U~khn1O!0S&O;Zllkbd85F0Ys@XIUD7godYd)j-TK^a5vdDpGl~J zd{|A&#^DkuLqX!oJ$(sA&D9HO8M9oAaw%-G#4Pu7^DgVr;9I-k-X25q=ES2^cv9Jy zDOZxty-ZMHUSitO#GMZw#U_|6a*G$-`QYScctJ`QFCesRVpG_r{=SpVvI$K#o7fCZ z+}kYr$Yf|DC5t8qEt)7kr|tO zq8A9O5atW|RdFnGvS@;ZLN9LLDSE+V=tW94nuxuCB3EoelDUbTj77#Z2V5_xh0Cm?`(x>KE26g==q)UyO=C<)EUFiMyIb__ z)(4?)o`{VD7W*d2+&Gf4aY^Rf+aHFd=!w`kV6kyY7S&_p$hA@ZhL{|~^XgYtnH-60 zL>|U!wlh3~d=Wg5;3xS1gwe4LCyeAA#TZYeikE2cLkpMstkAQ0Tm2c>IQ40nIECyQ z7a@AxB!%TVd?)*IJ4C2v1d9ZlQq*}tJ#2SzUj&aAP1}_f6u=~`Q3qVk4b{%D_&adOMT}YzZm8eFp09w?zP#px4xVS2cLsNNc z^X>yY{2#;vsNq)qB5*5G{(63m(5~YeGB||2I(T2m)Ru)aTn;y2raA&s 
z=(>BjGlh`;5P|71@7+UD8txt*A)Ah|K$4QHghyP{SwJ~GU_g$P?$nFi{hgT6IUGY7 zt$?VcEP|?^4Y%!L?q9uwAvt+TpUBl;UEPtNZ^OFF<0R83=d$4j*;E#|@hwaC4#;AY z>@x?crowLZ-QkNwx;UQQ_$mgrzqmel;)*6J^~25Z30XC!1jHl;4USek+g$9Xq%sRV zX}XkT-Mj<~LB(AVn$b5TZf=9ccn;+YL{~x@V0#x?gx-R+A3@eA6i}>uPlDE_##bKX z15{Rsm(a_GRn*6D-L_001FR*A*!z<-EQRc{bOpn!WcA%pITXDZT$fDXU;XrZymc6- z=e@>J#p09A^>TVuea}ZnPggF0>o1hXHH1~wKUV(s!IOY;#&cZh<9KM9f|;ZGS+S5p zib579POIxM=hr}gCRRtXsH@VqSVAGOQ1xA54UZt>tVh+aaGrTN_G>F;pOBE(xXXh- zLJmHs%KCZ9OI+_33Caf9s*^3qED<(wI^x?UxSN0pY*SJp5sjh z#DeP!N5cWflPknT6Hw@Je)O&qA{@Wh>CF+88lWw|k!$JL0?Rb^xJae;wYPGNLY03H zy&?<9K5`!dB0>51KI4t1pjTYb;?j7w&sAFx0qmwx7d29|rtssJ! zy~-FoWhxP%%KfpVoSpf7kagu(E8Edp(FB$#bTkmBx}$?`nVJw96bwk2juQr`wdY=+ zI#iJQPd@HZJ=m{6!Q^Z_h{f5R-l&gIn`tAcCV-8UQd)U! zn){RdExkF|crKjLFW}b1p1~27-YxzTAa?{WH9ZQ+pHYTpXkV;U#uB<@MO_6^+b>5r z@4hE9`>KQ)`gFa8g78LgTB%TG<6oSzzN3 z>^a~GGjueulF8V`^wZFQs8p{6UcMLr%f_baCS*#a)akyuh@apahy*DdaP`i| zaIsI6j*DHh|AxbDE#xnW?PVzTNnhkR<=|*|rbg3{#_1qVun{202(#!1xZwtd^M(Bj z&Wx3kkIvzgA@;>3uO)0&Bxlf48juL%A)FC}sDWI83=2KUx?%b5>Own5L+JMPX7~&=oGj%?`f_zm`EWe4XLPYD`86ZUur{1W`RK0iJ^5W8l)V z4!{uf!uhFEEQYjfE%HoQDK6>)>06&NCi3dHtq-$KL%_LlYQ7oV53~8p=h`mN%HSIK z5rdX!gWUyLUk|(cBGK3T7f4|OX*?iEHq&yNalE@PVn0!VBMvitIj|3u!`DH3h0U<0 zlgcELg_f13i)`avWPS$AG#MIz!AptVOf1}b0VAzsfZY1tBPeR->rCgN0%Czy)oejQ zx#N@G2-0I9Q+g4JfktIS+GM7AaO&tfXl;_gH9>dC`Qh&-8e%_kAyad=$ zu^_${%tBqh+;3U<1c7z9|88UGlgG9tq`0sE$BPwEZV)9*e<~PH(^YygoNP?S*87y# zmMKuer!P9!3dSqe!FlU#%VP)pRnWijw@bfH9ycNOl`Y_UyKQUf)z+>uvR}w2O%tXF zuSF3vny5dSKxda$0vKD&JC2DuZ^nYl3tv|QsBcZ!Eba?ynX{51N~i2i?nLC;%1F3! 
tBa0N)QlPqEZ8(r62%E>*;r#KBl{EhI`AHgf>HvA^`~ z_hs?^aF0)!Q^rh3@(59448dV^vo=0sNE%o<6ElOIbFmO2d)rSihLU3_U=zbII|qyL zR}x-to36R;>K67;!$Iz4y;f7z%}tA|7oB-td#Q#DmXgqC3A8Dg;z)U{EFB1QQuso~ zyX_yEEWpSiB3=uP+Eg aSt$LiZO0_EAW!puWZj_fn>Bu!kKH$?Rhh&9 literal 0 HcmV?d00001 diff --git a/src/main/java/edu/uc/rphash/kneefinder/JythonTest.java b/src/main/java/edu/uc/rphash/kneefinder/JythonTest.java index 1cd0689..6d1452d 100644 --- a/src/main/java/edu/uc/rphash/kneefinder/JythonTest.java +++ b/src/main/java/edu/uc/rphash/kneefinder/JythonTest.java @@ -71,10 +71,11 @@ private int findElbowIndex(double[] data){ private double[][] prepare(double[][] data, int smoothingWindow){ //smooth the data to make local minimum/maximum easier to find (this is Step 1 in the paper) - double[][] smoothedData = Maths.gaussianSmooth2d(data, smoothingWindow); + // double[][] smoothedData = Maths.gaussianSmooth2d(data, smoothingWindow); //prepare the data into the unit range (step 2 of paper) - double[][] normalisedData = Maths.minmaxNormalise(smoothedData); + // double[][] normalisedData = Maths.minmaxNormalise(smoothedData); + double[][] normalisedData = Maths.minmaxNormalise(data); //subtract normalised x from normalised y (this is step 3 in the paper) for (int i = 0; i < normalisedData.length; i++) { @@ -105,7 +106,9 @@ public double findElbowQuick(double[] data){ } // double[] normalisedData = Maths.minmaxNormalise1d(Maths.gaussianSmooth(data, 3)); // original parameter - double[] normalisedData = Maths.minmaxNormalise1d(Maths.gaussianSmooth(data, 1)); + double[] normalisedData = Maths.minmaxNormalise1d(Maths.gaussianSmooth(data, 0)); + + //do kneedle y'-x' (in this case x' is normalised index value) for (int i = 0; i < normalisedData.length; i++) { double normalisedIndex = (double)i / data.length; @@ -139,9 +142,11 @@ public ArrayList run(double[][] data, double s, int smoothingWindow, b ArrayList localMinMaxPts = new ArrayList<>(); //do steps 1,2,3 of the paper in the prepare method double[][] 
normalisedData = prepare(data, smoothingWindow); + //find candidate indices (this is step 4 in the paper) { ArrayList candidateIndices = findCandidateIndices(normalisedData, findElbows); + //ArrayList candidateIndices = findCandidateIndices(data, findElbows); //go through each candidate index, i, and see if the indices after i are satisfy the threshold requirement //(this is step 5 in the paper) double step = computeAverageVarianceX(normalisedData); @@ -194,117 +199,68 @@ public static void main(String[] args){ 2308, 2262, 2235, 2259, 2221, 2202, 2184, 2170, 2160, 2127, 2134, 2101, 2101, 2066, 2074, 2063, 2048, 2031 }; */ - double elbowdata2[] = {272445.84, - 139828.64, - 219647.36, - 149900.52, - 101875.555, - 90592.31, - 94776.5, - 59097.977, - 54506.95, - 70813.1, - 51619.59, - 72024.32, - 42364.402, - 49209.64, - 43121.777, - 58519.363, - 42506.32, - 53575.184, - 48930.42, - 67386.4, - 27424.889, - 58791.652, - 47980.53, - 57721.895, - 28586.846, - 47117.207, - 34060.79, - 46765.35, - 36411.176, - 38203.29, - 41664.164, - 30040.643, - 23410.227, - 37810.92, - 44158.805, - 36570.363, - 38791.527, - 26255.09, - 34368.848, - 33185.074, - 23464.494, - 58085.137, - 19323.424, - 28164.77, - 31947.02, - 34020.324, - 31572.951, - 40708.703, - 27046.771, - 37988.094, - 104162.72, - 33381.24, - 20126.354, - 23565.26, - 35915.094, - 34402.164, - 23505.94, - 25535.15, - 33915.32, - 25169.93, - 20888.271, - 36341.01, - 26020.947, - 29645.568, - 27043.643, - 24310.191, - 23757.668, - 19005.96, - 22007.072, - 17633.865, - 22680.45, - 11766.091, - 12725.509, - 34868.617, - 22989.531, - 23386.334, - 17618.283, - 22736.342, - 18922.049, - 24434.168, - 13263.041, - 9256.854, - 18594.143, - 21928.807, - 29263.688, - 16141.0205, - 14283.08, - 16031.739, - 14628.732, - 19026.465, - 16398.363, - 22941.205, - 25078.521, - 16121.506, - 10316.715, - 24983.184, - 17508.658, - 16489.285, - 9556.006, - 10829.478, + double elbowdata2[] = {5000, + 4000, + 3000, + 2000, + 1000, + 900, + 800, 
+ 700, + 600, + 500, + 450, + 400, + 350, + 300, + 250, + 225, + 200, + 175, + 150, + 125, + 100, + 75, + 50, + 25, + 24, + 23, + 22, + 21, + 20, + 19, + 18, + 17, + 16, + 15, + 14, + 13, + 12, + 11, + 10, + 10, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, } ; - double elbow_point = elbowcalculator.findElbowQuick(elbowdata2); + double elbow_point = elbowcalculator.findElbowQuick(elbowdata2); + System.out.print("elbow point value form 1D data : "+ elbow_point); - double[][] elbowdata3 = new double[100][2] ; - for (int i= 0;i<=99;i++) { + double[][] elbowdata3 = new double[50][2] ; + for (int i= 0;i<=49;i++) { - elbowdata3[i][1]= 99-i;} + elbowdata3[i][1]= 49-i;} - for (int i= 0;i<=99;i++) + for (int i= 0;i<=49;i++) { elbowdata3[i][0]= elbowdata2[i]; } @@ -312,7 +268,7 @@ public static void main(String[] args){ // public ArrayList run(double[][] data, double s, int smoothingWindow, boolean findElbows) - ArrayList elbows = elbowcalculator.run ( elbowdata3, 0 , 1 , false); + ArrayList elbows = elbowcalculator.run ( elbowdata3, 1 , 1, false); System.out.print("\n" + "number of elbow points : " + elbows.size()); for (double[] point : elbows) { diff --git a/src/main/java/edu/uc/rphash/kneefinder/JythonTest2.java b/src/main/java/edu/uc/rphash/kneefinder/JythonTest2.java index 979795c..30c6e06 100644 --- a/src/main/java/edu/uc/rphash/kneefinder/JythonTest2.java +++ b/src/main/java/edu/uc/rphash/kneefinder/JythonTest2.java @@ -6,7 +6,7 @@ import org.python.core.*; -class JythonTest2 +class JythonTest2 { //// does not work if there are external imports: @@ -29,7 +29,7 @@ public static void main(String[] args) { // xarray_1 = // yarray_2= - String[] arguments = new String[] {"python", "C:\\Users\\sayan\\eclipse-workspace\\pythonfunc\\pythonfunc2.py" , "huzhiwei", "25", "C:/Users/sayan/Documents/testdata/data.xlsx"}; +/* String[] arguments = new String[] {"python", "C:\\Users\\sayan\\eclipse-workspace\\pythonfunc\\pythonfunc2.py" , "huzhiwei", "25", 
"C:/Users/sayan/Documents/testdata/data.xlsx"}; try { Process process = Runtime.getRuntime().exec(arguments); BufferedReader in = new BufferedReader(new InputStreamReader(process.getInputStream())); @@ -43,5 +43,41 @@ public static void main(String[] args) { } catch (Exception e) { e.printStackTrace(); } +*/ + String[] arguments2 = new String[] {"python", "C:\\Users\\sayan\\git\\rphash-java\\src\\main\\java\\edu\\uc\\rphash\\kneefinder\\KneeLocator.py" , "huzhiwei", "25", "C:/Users/sayan/Documents/testdata/data.xlsx"}; + try { + Process process = Runtime.getRuntime().exec(arguments2); + BufferedReader in = new BufferedReader(new InputStreamReader(process.getInputStream())); + String line = null; + while ((line = in.readLine()) != null) { + System.out.println(line); + } + in.close(); + int re = process.waitFor(); + System.out.println(re); + } catch (Exception e) { + e.printStackTrace(); + } + + int[] int_array_x = new int[] {1,2,3,4,5, 6,7,8,9,10,11,12,13,14,15,16,17,18 ,19,20,21}; + float[] float_array_y = new float[] {5000,4000,3000,2000,1000,900,800,700,600,500,450,400,350,300,250,225,200,175,150,125,100}; + + String[] arguments3 = new String[] {"python", "C:\\Users\\sayan\\git\\rphash-java\\src\\main\\java\\edu\\uc\\rphash\\kneefinder\\KneeLocator.py" , "huzhiwei", "25", "C:/Users/sayan/Documents/testdata/data.xlsx"}; + try { + Process process = Runtime.getRuntime().exec(arguments2); + BufferedReader in = new BufferedReader(new InputStreamReader(process.getInputStream())); + String line = null; + while ((line = in.readLine()) != null) { + System.out.println(line); + } + in.close(); + int re = process.waitFor(); + System.out.println(re); + } catch (Exception e) { + e.printStackTrace(); + } + + + } } diff --git a/src/main/java/edu/uc/rphash/kneefinder/JythonTest3.java b/src/main/java/edu/uc/rphash/kneefinder/JythonTest3.java new file mode 100644 index 0000000..7c8b069 --- /dev/null +++ b/src/main/java/edu/uc/rphash/kneefinder/JythonTest3.java @@ -0,0 +1,58 @@ 
+package edu.uc.rphash.kneefinder; +/* + * + * import org.python.util.PythonInterpreter; + * import org.python.core.PyInstance; + * + * import java.io.BufferedReader; import java.io.InputStreamReader; + * + * import org.python.core.*; + * + * class JythonTest3 { + * + * //// does not work if there are external imports: + * + * // public static void main(String[] args) { // PythonInterpreter interpreter + * = new PythonInterpreter(); // // interpreter.execfile( + * "C:\\Users\\sayan\\eclipse-workspace\\pythonfunc\\pythonfunc1.py"); // + * PyFunction function = + * (PyFunction)interpreter.get("my_test",PyFunction.class); // PyObject pyobject + * = function.__call__(new PyString("huzhiweiww"),new PyString("2225")); // + * System.out.println("anwser = " + pyobject.toString()); // } // + * + * static PythonInterpreter interpreter; + * + * @SuppressWarnings("resource") public static void main( String gargs[] ) { + * //String[] s = {"New York", "Chicago" , "errr"}; int[] s = new int[] + * {1,2,3,4,5, 6,7,8,9,10,11,12,13,14,15,16,17,18 ,19,20,21}; + * PythonInterpreter.initialize(System.getProperties(),System.getProperties(), + * s); interpreter = new PythonInterpreter(); interpreter.execfile( + * "C:\\Users\\sayan\\git\\rphash-java\\src\\main\\java\\edu\\uc\\rphash\\kneefinder\\PyScript.py" + * ); PyInstance hello = (PyInstance) interpreter.eval("PyScript" + "(" + "None" + * + ")"); } + * + * public void getData(Object[] data) { for (int i = 0; i < data.length; i++) { + * System.out.print(data[i].toString()); } + * + * } } + * + */ + + import org.python.util.PythonInterpreter; + import org.python.core.*; + + public class JythonTest3 { + public static void main(String a[]){ + + PythonInterpreter python = new PythonInterpreter(); + + int number1 = 5; + int number2 = 6; + + python.set("number1", new PyInteger(number1)); + python.set("number2", new PyInteger(number2)); + python.exec("number3 = number1+number2"); + PyObject number3 = python.get("number3"); + 
System.out.println("Returned Value is : "+number3.toString()); + } + } \ No newline at end of file diff --git a/src/main/java/edu/uc/rphash/kneefinder/KneeLocator.py b/src/main/java/edu/uc/rphash/kneefinder/KneeLocator.py index 3ba35f0..626ba8c 100644 --- a/src/main/java/edu/uc/rphash/kneefinder/KneeLocator.py +++ b/src/main/java/edu/uc/rphash/kneefinder/KneeLocator.py @@ -1,47 +1,20 @@ -#package edu.uc.rphash.kneefinder; - -import edu.uc.rphash.Centroid; -import edu.uc.rphash.Readers.RPHashObject; -import edu.uc.rphash.frequentItemSet.KHHCentroidCounter; -import edu.uc.rphash.lsh.LSH; -import java.util.ArrayList; -#import org.python.util.PythonInterpreter; -#import org.python.core.*; - - - import sys - from scipy.constants import convert_temperature - - import numpy as np - from scipy import interpolate - from scipy.signal import argrelextrema - from sklearn.preprocessing import PolynomialFeatures - from sklearn.linear_model import LinearRegression - import warnings - from typing import Tuple, Optional, Iterable - import matplotlib.pyplot as plt - import pandas as pd - - import warnings # did not install +#from edu.uc.rphash.kneefinder import JythonTest2 - - - def my_test(name, age, file): @@ -736,6 +709,11 @@ def all_norm_elbows_y(self): print(kn2.knee) +print("success") + + + + #print(kn.norm_knee) diff --git a/src/main/java/edu/uc/rphash/kneefinder/Kneedle.java b/src/main/java/edu/uc/rphash/kneefinder/Kneedle.java index 93fa888..17646ff 100644 --- a/src/main/java/edu/uc/rphash/kneefinder/Kneedle.java +++ b/src/main/java/edu/uc/rphash/kneefinder/Kneedle.java @@ -101,7 +101,7 @@ public double findElbowQuick(double[] data){ return 0; } - double[] normalisedData = Maths.minmaxNormalise1d(Maths.gaussianSmooth(data, 3)); + double[] normalisedData = Maths.minmaxNormalise1d(Maths.gaussianSmooth(data, 1)); //do kneedle y'-x' (in this case x' is normalised index value) for (int i = 0; i < normalisedData.length; i++) { diff --git 
a/src/main/java/edu/uc/rphash/kneefinder/PyScript.py b/src/main/java/edu/uc/rphash/kneefinder/PyScript.py new file mode 100644 index 0000000..32f0d0f --- /dev/null +++ b/src/main/java/edu/uc/rphash/kneefinder/PyScript.py @@ -0,0 +1,13 @@ +#import JythonTest3 + +import sys + +class PyScript: + def __init__(self,txt): + city = [] + for i in range(0,len(sys.argv)): + city.append(str(sys.argv[i])) + print(city) +# jObj = JavaProg() +# jObj.getData(city) + print("Done") \ No newline at end of file From 0576de8dd0abb85bc8bcc955cfcfd912e0b5cc3c Mon Sep 17 00:00:00 2001 From: Sayantan Date: Thu, 12 May 2022 03:40:32 -0400 Subject: [PATCH 26/29] added another sub method for knee finding in java used linear interpolation to smoothen the initial data --- .../edu/uc/rphash/kneefinder/JythonTest.java | 19 +++++++-- .../edu/uc/rphash/kneefinder/KneeLocator.py | 7 +++- src/main/java/edu/uc/rphash/util/Maths.java | 42 +++++++++++++++++++ 3 files changed, 63 insertions(+), 5 deletions(-) diff --git a/src/main/java/edu/uc/rphash/kneefinder/JythonTest.java b/src/main/java/edu/uc/rphash/kneefinder/JythonTest.java index 6d1452d..ef0040b 100644 --- a/src/main/java/edu/uc/rphash/kneefinder/JythonTest.java +++ b/src/main/java/edu/uc/rphash/kneefinder/JythonTest.java @@ -71,11 +71,24 @@ private int findElbowIndex(double[] data){ private double[][] prepare(double[][] data, int smoothingWindow){ //smooth the data to make local minimum/maximum easier to find (this is Step 1 in the paper) - // double[][] smoothedData = Maths.gaussianSmooth2d(data, smoothingWindow); + double[][] smoothedData = Maths.gaussianSmooth2d(data, smoothingWindow); + double[][] smoothedData2 = Maths.Smooth2d(data); + System.out.println("this is the smoothed out data using gaussian kernal -------------------"); + System.out.println(Arrays.deepToString(smoothedData)); + System.out.println(data.length); + + System.out.println(";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;"); + + System.out.println("this is the smoothed out 
data using linear interpolation -------------------"); + System.out.println(Arrays.deepToString(smoothedData2)); + //prepare the data into the unit range (step 2 of paper) - // double[][] normalisedData = Maths.minmaxNormalise(smoothedData); - double[][] normalisedData = Maths.minmaxNormalise(data); + double[][] normalisedData = Maths.minmaxNormalise(smoothedData2 ); + + + + // double[][] normalisedData = Maths.minmaxNormalise(data); //subtract normalised x from normalised y (this is step 3 in the paper) for (int i = 0; i < normalisedData.length; i++) { diff --git a/src/main/java/edu/uc/rphash/kneefinder/KneeLocator.py b/src/main/java/edu/uc/rphash/kneefinder/KneeLocator.py index 626ba8c..9a6c11a 100644 --- a/src/main/java/edu/uc/rphash/kneefinder/KneeLocator.py +++ b/src/main/java/edu/uc/rphash/kneefinder/KneeLocator.py @@ -155,6 +155,9 @@ def __init__( uspline = interpolate.interp1d(self.x, self.y) self.Ds_y = uspline(self.x) + print("this is the smoothed data---------------------------") + print(self.Ds_y) + elif interp_method == "polynomial": @@ -658,9 +661,9 @@ def all_norm_elbows_y(self): -nameoffile = my_test(sys.argv[1], sys.argv[2], sys.argv[3]) # this is for the java calling +#nameoffile = my_test(sys.argv[1], sys.argv[2], sys.argv[3]) # this is for the java calling -#nameoffile = my_test("sam","25", "C:/Users/sayan/Documents/testdata/data.xlsx") # this is for the python test +nameoffile = my_test("sam","25", "C:/Users/sayan/Documents/testdata/data.xlsx") # this is for the python test diff --git a/src/main/java/edu/uc/rphash/util/Maths.java b/src/main/java/edu/uc/rphash/util/Maths.java index 48c4d20..51e1882 100644 --- a/src/main/java/edu/uc/rphash/util/Maths.java +++ b/src/main/java/edu/uc/rphash/util/Maths.java @@ -2,6 +2,11 @@ import java.util.*; + +//import org.apache.commons.math.*; +import org.apache.commons.math3.analysis.interpolation.LinearInterpolator; +import org.apache.commons.math3.analysis.polynomials.PolynomialSplineFunction; + // taken 
from " https://github.com/lukehb/137-common/blob/master/src/main/java/onethreeseven/common/util/Maths.java by Luke Bermingham " /** * A utility of mathematical methods. @@ -537,6 +542,43 @@ public static double[][] gaussianSmooth2d(double[][] data, int w){ return smoothed; } + public static double[][] Smooth2d(double[][] data){ + // double linearInterp(double[] x, double[] y, double xi) + + int size = data.length; //50 + double x[] = new double[size]; + double xi[] = new double[size]; + double y[] = new double[size]; + double smooth_xy[][] =new double[size][2]; + + for ( int i=0 ; i<=size-1 ; i++) { + x[i] = data[(size-1)-i][1]; + y[i] = data[i][0]; + + } + + // return linear interpolation of (x,y) on xi + LinearInterpolator li = new LinearInterpolator(); + // + + PolynomialSplineFunction psf = li.interpolate(x,y); + + for ( int i=0 ; i<=size-1 ; i++) { + + smooth_xy[(size-1)-i][1]= x[i]; + + smooth_xy[i][0]= psf.value(x[i]); + + } + + + return smooth_xy; + + + } + + + /** * Normalise the 1d data using min-max normalisation. * @see Wikipedia article about feature re-scaling. 
From ddb5d66bc4c9833449c373da375389a75511eb32 Mon Sep 17 00:00:00 2001 From: Sayantan Date: Thu, 12 May 2022 03:44:12 -0400 Subject: [PATCH 27/29] minor update --- src/main/java/edu/uc/rphash/util/Maths.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main/java/edu/uc/rphash/util/Maths.java b/src/main/java/edu/uc/rphash/util/Maths.java index 51e1882..f7ea5b1 100644 --- a/src/main/java/edu/uc/rphash/util/Maths.java +++ b/src/main/java/edu/uc/rphash/util/Maths.java @@ -542,6 +542,8 @@ public static double[][] gaussianSmooth2d(double[][] data, int w){ return smoothed; } + + public static double[][] Smooth2d(double[][] data){ // double linearInterp(double[] x, double[] y, double xi) From 9194bfcd0a7a1eb6681ef398ec531ad7eee7aeca Mon Sep 17 00:00:00 2001 From: Sayantan Date: Mon, 1 Aug 2022 05:43:25 -0400 Subject: [PATCH 28/29] updating --- src/main/java/edu/uc/rphash/PPAHStream.java | 1115 ++++++++++++----- .../java/edu/uc/rphash/PPAHStream_v2.java | 320 +++++ .../java/edu/uc/rphash/PRPHashStream.java | 4 +- .../uc/rphash/Readers/SimpleArrayReader.java | 31 + src/main/java/edu/uc/rphash/TWRPv4.java | 2 + src/main/java/edu/uc/rphash/TWRPv6_WCSS2.java | 12 +- .../TWRPv6_wcss_offline2_TEST2_10runs.java | 2 +- .../edu/uc/rphash/kneefinder/JythonTest.java | 68 +- .../java/edu/uc/rphash/tests/test_elbow.java | 89 ++ 9 files changed, 1306 insertions(+), 337 deletions(-) create mode 100644 src/main/java/edu/uc/rphash/PPAHStream_v2.java create mode 100644 src/main/java/edu/uc/rphash/tests/test_elbow.java diff --git a/src/main/java/edu/uc/rphash/PPAHStream.java b/src/main/java/edu/uc/rphash/PPAHStream.java index 59939cc..559120e 100644 --- a/src/main/java/edu/uc/rphash/PPAHStream.java +++ b/src/main/java/edu/uc/rphash/PPAHStream.java @@ -1,324 +1,791 @@ -package edu.uc.rphash; - - - - -/* - This class will run the Parameter-free Projected Adaptive Hash Stream Clustering - */ -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import 
java.util.Random; -import java.util.Map.Entry; -import java.util.TreeSet; -import java.util.stream.Stream; - -import edu.uc.rphash.Readers.RPHashObject; -import edu.uc.rphash.Readers.SimpleArrayReader; -import edu.uc.rphash.projections.Projector; -import edu.uc.rphash.tests.StatTests; -import edu.uc.rphash.tests.generators.GenerateStreamData; - - - - -public class PPAHStream implements StreamClusterer { - - - private float[] rngvec; - private List centroids = null; - private RPHashObject so; - // #create projector matrixs - Projector projector ; - int ct=0; - int pdim = 20; - - public PPAHStream(int k, GenerateStreamData gen, int i) { - so = new SimpleArrayReader(gen,k); - projector = so.getProjectionType(); - projector.setOrigDim(so.getdim()); - projector.setProjectedDim(pdim); - projector.setRandomSeed(so.getRandomSeed()); - projector.init(); - initTablesWith(); - } - - public List getCentroids(RPHashObject so) { - this.so = so; - return getCentroids(); - } - - - /* - * X - set of vectors compute the medoid of a vector set - */ - /** Add vector to running Centroid - * @param cnt_1,cnt_2 - * @param x_1 - */ - public static float[] update_cent(int ct, float[] x, float[] cent){ - for(int i=0;i 0) - s += 1; - addcent(s,x); - } - return s; - } - - - /* - * ===========================MinCount Sketch======================= - */ - public static final long PRIME_MODULUS = (1L << 31) - 1; - private int depth; - private int width; - private int[][] tableS; - private float[][][] tableCent; - private long[] hashA; - - - private void initTablesWith() { - this.width = (int) Math.ceil(2 / .025); - this.depth = (int) Math.ceil(-Math.log(1 - .97) / Math.log(2)); - this.tableS = new int[depth][width]; - this.tableCent = new float[depth][width][];//we will fill these in as we need them - this.hashA = new long[depth];//hash offsets - Random r = new Random(); - for (int i = 0; i < depth; ++i) { - hashA[i] = r.nextLong(); - } - } - - private int hash(long item, int i) { - long hash = 
hashA[i] * item; - hash += hash >>> 32; - hash &= PRIME_MODULUS; - return (int) (hash % width); - - } - - private int count(long lshhash) { - int min = (int) tableS[0][hash(lshhash, 0)]; - for (int i = 1; i < depth; ++i) { - if (tableS[i][hash(lshhash, i)] < min) - min = (int) tableS[i][hash(lshhash, i)]; - } - return min; - } - - private float[] get_cent_sketch(long lshhash) { - int min = (int) tableS[0][hash(lshhash, 0)]; - int mini = 0; - int minhtmp = 0; - for (int i = 1; i < depth; ++i) { - int htmp = hash(lshhash, i); - if (tableS[i][hash(lshhash, i)] < min){ - mini = i; - minhtmp = htmp; - min = (int) tableS[i][htmp]; - } - } - - return tableCent[mini][minhtmp]; - } - - private void addcent(long lshhash, float[] x){ - - int htmp = hash(lshhash, 0); - int argmini = 0; - int argminhtmp = htmp; - - tableS[0][htmp] += 1; - int min = (int) tableS[0][htmp]; - - for (int i = 1; i < depth; ++i) { - htmp = hash(lshhash, i); - tableS[i][htmp] += 1; - - if (tableS[i][htmp] < min){ - min = (int) tableS[i][htmp]; - argmini = i; - argminhtmp = htmp; - } - } - - if(tableCent[argmini][argminhtmp]==null){ - tableCent[argmini][argminhtmp] = x; - } - else{ - update_cent(min, x, tableCent[argmini][argminhtmp]); - } - } - /* - * ===========================MinCount Sketch======================= - */ - - - - /* - * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid - * vector map - * - * hash the projected vector x and update the hash to centroid and counts - * maps - */ - void addtocounter(float[] x, Projector p) { - float[] xt = p.project(x); - hashvec(xt, x); - } - - @Override - public long addVectorOnlineStep(float[] x) { - addtocounter(x, projector); - return 0; - } - - @Override - public List getCentroidsOfflineStep() { - - // next we want to prune the tree by parent count comparison - // follows breadthfirst search - HashMap densityAndID = new HashMap(); - for (Long cur_id =0l;cur_id<2<>> 1; - long parent_count = count(parent_id); - - if (2 * cur_count > 
parent_count) { - densityAndID.put(parent_id, 0l); - densityAndID.put(cur_id,cur_count); - } - } - - //remove keys with support less than 2 - Stream> stream = densityAndID.entrySet().stream().filter(p -> p.getValue() > 1); - //64 so 6 bits? - //stream = stream.filter(p -> p.getKey() > 64); - - List sortedIDList= new ArrayList<>(); - // sort and limit the list - stream.sorted(Entry. comparingByValue().reversed()).limit(so.getk()*1) - .forEachOrdered(x -> sortedIDList.add(x.getKey())); - - // compute centroids - List estcents = new ArrayList<>(); - for (int i = 0; i < sortedIDList.size(); i++) { - System.out.println(densityAndID.get(sortedIDList.get(i))); - if(get_cent_sketch(sortedIDList.get(i))!=null) - estcents.add(new Centroid( get_cent_sketch(sortedIDList.get(i)))); - } - - return estcents; - } - - @Override - public void shutdown() { - } - - @Override - public int getProcessors() { - return 0; - } - - @Override - public List getCentroids() { - return null; - } - - - public static void main(String[] args) throws Exception { - - int k = 20; - int d = 1000; - int interval = 10000; - float var = 1f; - - Runtime rt = Runtime.getRuntime(); - GenerateStreamData gen = new GenerateStreamData(k, d, var, 1133131); - - StreamClusterer rphit = new PPAHStream(k, gen, 1); - //StreamClusterer rphit = new RPHashStreaming(k, gen, 1); - - ArrayList vecsInThisRound = new ArrayList(); - - System.out.printf("Vecs\tMem(KB)\tTime\tWCSSE\n"); - long timestart = System.nanoTime(); - for (int i = 0; i < interval * 10; i++) { - vecsInThisRound.add(gen.generateNext()); - if (i % interval == interval - 1) { - timestart = System.nanoTime(); - for (float[] f : vecsInThisRound) { - rphit.addVectorOnlineStep(f); - } - - List cents = rphit.getCentroidsOfflineStep(); - long time = System.nanoTime() - timestart; - rt.gc(); - long usedkB = (rt.totalMemory() - rt.freeMemory()) / 1024; - double wcsse = StatTests.WCSSECentroidsFloat(cents, - vecsInThisRound); - vecsInThisRound = new ArrayList(); - 
System.out.printf("%d\t%d\t%.4f\t%.4f\n", i, usedkB, - time / 1000000000f, wcsse); - } - } - } - @Override - public RPHashObject getParam() { - return so; - } - - @Override - public void setWeights(List counts) { - // TODO Auto-generated method stub - - } - - @Override - public void setData(List centroids) { - this.centroids = centroids; - - } - - @Override - public void setRawData(List centroids) { - if (this.centroids == null) - this.centroids = new ArrayList<>(centroids.size()); - for (float[] f : centroids) { - this.centroids.add(new Centroid(f, 0)); - } - } - - @Override - public void setK(int getk) { - this.so.setK(getk); - } - - @Override - public void reset(int randomseed) { - centroids = null; - so.setRandomSeed(randomseed); - } - - @Override - public boolean setMultiRun(int runs) { - // TODO Auto-generated method stub - return false; - } - -} +package edu.uc.rphash; + +// This class will run the Parameter-free Projected Adaptive Hash Stream Clustering + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Comparator; +//import java.util.Arrays; +import java.util.HashMap; +//import java.util.Iterator; +//import java.util.LinkedHashMap; +import java.util.List; +//import java.util.Map; +import java.util.Map.Entry; +import java.util.Random; +import java.util.TreeSet; +import java.util.stream.Stream; +import java.util.Collections; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.clusterers.Agglomerative3; +import edu.uc.rphash.tests.clusterers.KMeans2; +import edu.uc.rphash.tests.clusterers.Agglomerative3.ClusteringType; +import edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.VectorUtil; +import edu.uc.rphash.tests.clusterers.DBScan; +import edu.uc.rphash.tests.clusterers.MultiKMPP; + + +//import 
org.apache.commons.collections.map.MultiValueMap; +//import org.apache.commons.collections.map.*; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Multimap; + + +// https://www.javatips.net/api/webofneeds-master/webofneeds/won-matcher-solr/src/main/java/won/matcher/solr/utils/Kneedle.java +// https://github.com/lukehb/137-stopmove/blob/master/src/main/java/onethreeseven/stopmove/algorithm/Kneedle.java + +// this algorithm runs twrp 3 times : (only the random bisection vector varies, the Projection matrix remains same) +// and selects the one which has the best wcss offline for the 10X candidate centroids. + + +public class PPAHStream implements Clusterer, Runnable { + + + List labels; // to directly output labels + HashMap labelmap; // to directly output labels + public List getLabels() { + for (int i = 0; i < labels.size(); i++) { + if (labelmap.containsKey(labels.get(i))) { + labels.set(i, labelmap.get(labels.get(i))); + } else { + labels.set(i, -1l); + } + } + return this.labels; + } + + + + boolean znorm = false; + private int counter; + private float[] rngvec; + private float[] rngvec2; + private float[] rngvec3; + private float eps; + + private List centroids = null; + + private RPHashObject so; + + public PPAHStream(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + return getCentroids(); + } + + @Override + public List getCentroids() { + if (centroids == null) + run(); + return centroids; + } + + +// This function returns the square of the euclidean distance. 
+ public static float distancesq(float[] x, float[] y) { + if (x.length < 1) + return Float.MAX_VALUE; + if (y.length < 1) + return Float.MAX_VALUE; + float dist = (x[0] - y[0]) * (x[0] - y[0]); + for (int i = 1; i < x.length; i++) + dist += ((x[i] - y[i]) * (x[i] - y[i])); +// return (float) Math.sqrt(dist); + return dist; + } + + + /* + * X - set of vectors compute the medoid of a vector set + */ + float[] medoid(List X) { + float[] ret = X.get(0); + for (int i = 1; i < X.size(); i++) { + for (int j = 0; j < ret.length; j++) { + ret[j] += X.get(i)[j]; + } + } + for (int j = 0; j < ret.length; j++) { + ret[j] = ret[j] / ((float) X.size()); + } + return ret; + } + +// this updates the map two cents with different weights are merged into one. + public static float[][] UpdateHashMap(float cnt_1, float[] x_1, float wcss_1, + float cnt_2, float[] x_2 , float wcss_2) { + + float cnt_r = cnt_1 + cnt_2; + + float[] x_r = new float[x_1.length]; + + for (int i = 0; i < x_1.length; i++) { + x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; + + } +// float wcss = (distancesq(x_r,x_2)/cnt_r) + wcss_1; +// float wcss = ( ( cnt_1*(wcss_1 + distancesq(x_r,x_1)) ) + distancesq(x_r,x_2) ) / (cnt_1); + + float wcss = ( ((wcss_1 + distancesq(x_r,x_1)) ) + distancesq(x_r,x_2) ); + +// float wcss = ( ( ( cnt_1*(wcss_1 + distancesq(x_r,x_1)) ) + distancesq(x_r,x_2) ) / (cnt_r) ); +// System.out.println("wcsse = " + wcss); + + float[][] ret = new float[3][]; + ret[0] = new float[1]; + ret[0][0] = cnt_r; + ret[1] = x_r; + ret[2]= new float [1]; + ret[2][0]= wcss; + return ret; + + } + + + public long hashvec2( float[] xt, float[] x, + HashMap MapOfIDAndCent, HashMap MapOfIDAndCount, int ct, float[] rngvec, HashMap MapOfIDAndWCSS) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { + s = s << 1 ; // left shift the bits of s by 1. 
+ if (xt[i] > rngvec[i]) + s= s+1; + + if (MapOfIDAndCent.containsKey(s)) { + + float CurrentCount = MapOfIDAndCount.get(s); + float CurrentCent [] = MapOfIDAndCent.get(s); + float CountForIncomingVector = 1; + float IncomingVector [] = x; + float currentWcss= MapOfIDAndWCSS.get(s); + float incomingWcss= 0; + + float[][] MergedValues = UpdateHashMap(CurrentCount , CurrentCent, currentWcss, CountForIncomingVector, IncomingVector, incomingWcss ); + + Long UpdatedCount = (long) MergedValues[0][0] ; + + float[] MergedVector = MergedValues[1] ; + + float wcss= MergedValues[2][0]; + + MapOfIDAndCount.put(s , UpdatedCount); + + MapOfIDAndCent.put(s, MergedVector); + + MapOfIDAndWCSS.put(s, wcss); + + } + + else { + + float[] xlist = x; + MapOfIDAndCent.put(s, xlist); + MapOfIDAndCount.put(s, (long)1); + MapOfIDAndWCSS.put(s, (float)0); + } + } + return s; + } + + + + /* + * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid + * vector map + * + * hash the projected vector x and update the hash to centroid and counts + * maps + */ + void addtocounter(float[] x, Projector p, + HashMap IDAndCent,HashMap IDandID,int ct, float[] rngvec , HashMap IDandWCSS) { + float[] xt = p.project(x); + + hashvec2(xt,x,IDAndCent, IDandID, ct,rngvec , IDandWCSS); + } + + + static boolean isPowerOfTwo(long num) { + return (num & -num) == num; + } + + + public void printHashmap(HashMap hashmap) { + + System.out.println(hashmap.keySet()); + System.out.println(hashmap.values()); + + } +public void printStream(Stream> stream) { + + //System.out.println(hashmap.keySet()); + System.out.println(stream.count()); + +} +// this method calculates the epsilon value and prints the information. 
+public float printInfo(ListsetofKeys, HashMap MapOfIDAndCount, HashMap MapOfIDAndCent, HashMap MapOfIDAndWCSS) { + + List counts = new ArrayList<>(); + List wcsseprint = new ArrayList<>(); +// float temp = 0; + int elements=0; + float avg=0; + + for (Long keys: setofKeys) + { + elements=elements+1; +//// System.out.println(MapOfIDAndCount.get(keys)); + counts.add(MapOfIDAndCount.get(keys)); + wcsseprint.add(MapOfIDAndWCSS.get(keys)); + + } +// System.out.println(); + System.out.print(counts); + +// for (Long keys: setofKeys) +// { +// System.out.println(MapOfIDAndWCSS.get(keys)); +// wcsseprint.add(MapOfIDAndWCSS.get(keys)); +// } + + // calculation of epsilon + /* + for (int i=0 ; i<(0.8*elements); i++) //for (int i=0 ; i<(0.8*elements); i++) + { + temp = temp + (wcsseprint.get(i))/(counts.get(i)); + } + avg = (float) (temp/(0.8*elements)); + System.out.println(); + System.out.println("\taverage epsilon = "+ avg); + */ + Collections.sort(wcsseprint); + Collections.reverse(wcsseprint); + System.out.println(); + System.out.println(wcsseprint); + System.out.println(); + + return (avg); + } + + /* + * X - data set k - canonical k in k-means l - clustering sub-space Compute + * density mode via iterative deepening hash counting + */ + + public Multimap findDensityModes2() { + //public Map findDensityModes2() { + HashMap MapOfIDAndCent1 = new HashMap<>(); + HashMap MapOfIDAndCount1 = new HashMap<>(); + HashMap MapOfIDAndWCSS1 = new HashMap<>(); + + HashMap MapOfIDAndCent2 = new HashMap<>(); + HashMap MapOfIDAndCount2 = new HashMap<>(); + HashMap MapOfIDAndWCSS2 = new HashMap<>(); + + HashMap MapOfIDAndCent3 = new HashMap<>(); + HashMap MapOfIDAndCount3 = new HashMap<>(); + HashMap MapOfIDAndWCSS3 = new HashMap<>(); + + + // #create projector matrixs + Projector projector = so.getProjectionType(); + projector.setOrigDim(so.getdim()); + projector.setProjectedDim(so.getDimparameter()); + + projector.setRandomSeed(so.getRandomSeed()); + 
//projector.setRandomSeed(949124732); + + projector.init(); + int cutoff = so.getCutoff(); + + int ct = 0; + int ct2 = 0; + int ct3 =0; + + { + + for (float[] x : so.getRawData()) + { + addtocounter(x, projector, MapOfIDAndCent1, MapOfIDAndCount1,ct++, rngvec, MapOfIDAndWCSS1); + addtocounter(x, projector, MapOfIDAndCent2, MapOfIDAndCount2,ct++, rngvec2,MapOfIDAndWCSS2); + addtocounter(x, projector, MapOfIDAndCent3, MapOfIDAndCount3,ct++, rngvec3,MapOfIDAndWCSS3); + + } + } + + System.out.println("\nNumberOfVertors = , "+ ct); + System.out.println("\nNumberOfMicroClustersBeforePruning = , "+ MapOfIDAndCent1.size()); + //printHashmap(MapOfIDAndCount1); + + // next we want to prune the tree by parent count comparison + // follows breadthfirst search + + HashMap denseSetOfIDandCount2_1 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount1.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount1.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount1.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_1.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + MapOfIDAndCent1.put(parent_id, new float[]{}); + + // MapOfIDAndCount1.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_1.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent1.put(parent_id, new float[]{}); + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + } + } + } + } + } + + HashMap denseSetOfIDandCount2_2 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount2.keySet())) + { + + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount2.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int 
parent_count = (int) (MapOfIDAndCount2.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_2.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + MapOfIDAndCent2.put(parent_id, new float[]{}); + + // MapOfIDAndCount2.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_2.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent2.put(parent_id, new float[]{}); + // MapOfIDAndCount2.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_3 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount3.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount3.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount3.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_3.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + MapOfIDAndCent3.put(parent_id, new float[]{}); + + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_3.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent3.put(parent_id, new float[]{}); + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + } + } + } + } + } + + + //remove keys with support less than 1 + Stream> stream2_1 = denseSetOfIDandCount2_1.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2_1= new ArrayList<>(); + // sort and limit the list + stream2_1.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_1.add(x.getKey())); + // printHashmap(denseSetOfIDandCount2_1); + + Stream> stream2_2 = denseSetOfIDandCount2_2.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2_2= new ArrayList<>(); + // sort and limit the list + stream2_2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_2.add(x.getKey())); + //printHashmap(denseSetOfIDandCount2_2); + + + Stream> stream2_3 = denseSetOfIDandCount2_3.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2_3= new ArrayList<>(); + // sort and limit the list + stream2_3.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_3.add(x.getKey())); + //printHashmap(denseSetOfIDandCount2_3); + + float WCSS1 = 0; + float WCSS2 = 0; + float WCSS3 = 0; + + HashMap denseSetOfIDandCount2 = new HashMap(); + + HashMap MapOfIDAndCent = new HashMap<>(); + HashMap MapOfIDAndCount = new HashMap<>(); + HashMap MapOfIDAndWCSS = new HashMap<>(); + + + + + //* THIS IS THE RUNTIME CALCULATION OF WCSS STATISTICS WHICH IS DONE ONLINE: + + for (Long keys: sortedIDList2_1) +// for (Long cur_id : (((HashMap) stream2_1).keySet())) + { // System.out.println("wcss1 = " + MapOfIDAndWCSS1.get(cur_id)); + WCSS1 = WCSS1 + MapOfIDAndWCSS1.get(keys);} + +// for (Long cur_id : (denseSetOfIDandCount2_2.keySet())) + for (Long keys: sortedIDList2_2) + { WCSS2 = WCSS2 + MapOfIDAndWCSS2.get(keys);} + +// for (Long cur_id : (denseSetOfIDandCount2_3.keySet())) + for (Long keys: sortedIDList2_3) + { WCSS3 = WCSS3 + MapOfIDAndWCSS3.get(keys);} + + + System.out.println("wcss1(online calc) of candidate cents = , " + WCSS1); +// System.out.println(" wcss_ofline_calc_1 = " + WCSS_off_1); + + System.out.println("wcss1(online calc) of candidate cents = , " + WCSS2); +// System.out.println(" wcss_ofline_calc_2 = " + WCSS_off_2); + + System.out.println("wcss1(online calc) of 
candidate cents = , " + WCSS3); +// System.out.println(" wcss_ofline_calc_3 = " + WCSS_off_3); + + if ((WCSS1 <= WCSS2) && (WCSS1 <= WCSS3)) + {MapOfIDAndCount = MapOfIDAndCount1; + MapOfIDAndCent = MapOfIDAndCent1; + MapOfIDAndWCSS = MapOfIDAndWCSS1; + denseSetOfIDandCount2 = denseSetOfIDandCount2_1; + System.out.println("winner = tree1"); + } + else if ((WCSS2<= WCSS1) && (WCSS2<=WCSS3)) + {MapOfIDAndCount = MapOfIDAndCount2; + MapOfIDAndCent = MapOfIDAndCent2; + MapOfIDAndWCSS = MapOfIDAndWCSS2; + denseSetOfIDandCount2 = denseSetOfIDandCount2_2; + System.out.println("winner = tree2"); + } + else + {MapOfIDAndCount = MapOfIDAndCount3; + MapOfIDAndCent = MapOfIDAndCent3; + MapOfIDAndWCSS = MapOfIDAndWCSS3; + denseSetOfIDandCount2 = denseSetOfIDandCount2_3; + System.out.println("winner = tree3"); + + } + + System.out.println("NumberOfMicroClusters_AfterPruning_&_beforesortingLimit = , "+ denseSetOfIDandCount2.size()); + + //remove keys with support less than 1 + Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 2); + + List sortedIDList2= new ArrayList<>(); + // sort and limit the list + stream2.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2.add(x.getKey())); + + System.out.println("------------------------------------------------------------------------------------------------------------------"); + //printHashmap(denseSetOfIDandCount2); + float eps= printInfo(sortedIDList2,denseSetOfIDandCount2, MapOfIDAndCent,MapOfIDAndWCSS); +// seteps(eps); + + Multimap multimapWeightAndCent = ArrayListMultimap.create(); + + for (Long keys: sortedIDList2) + + { + + multimapWeightAndCent.put((Long)(MapOfIDAndCount.get(keys)), (float[]) (MapOfIDAndCent.get(keys))); + + } + + + return multimapWeightAndCent; + +} + + public void run() { + rngvec = new float[so.getDimparameter()]; + + rngvec2 = new float[so.getDimparameter()]; + + rngvec3 = new float[so.getDimparameter()]; + + counter = 0; + boolean randVect = so.getRandomVector(); + + // Random r = new Random(so.getRandomSeed()); + // Random r = new Random(3800635955020675334L) ; + + Random r = new Random(); + //Random r = new Random(923063597592675214L) ; + Random r2 = new Random(); + //Random r2 = new Random(923063597592675214L) ; + Random r3 = new Random(); + //Random r3 = new Random(923063597592675214L) ; + + if (randVect==true){ + for (int i = 0; i < so.getDimparameter(); i++) { + rngvec[i] = (float) r.nextGaussian(); + //System.out.println(rngvec[i]); + } + for (int i = 0; i < so.getDimparameter(); i++) + rngvec2[i] = (float) r2.nextGaussian(); + + for (int i = 0; i < so.getDimparameter(); i++) + rngvec3[i] = (float) r3.nextGaussian(); + + } else { + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) 0; + } + + Multimap WeightAndClusters = findDensityModes2(); + + Listcentroids2 = new ArrayList<>(); + List weights2 =new ArrayList<>(); + + System.out.println("\tNumberOfMicroClusters_AfterPruning = , "+ WeightAndClusters.size()); +// System.out.println("getRandomVector = "+ randVect); + + for (Long weights : WeightAndClusters.keys()) + { + + 
weights2.add((float)weights); + + } + + + for (Long weight : WeightAndClusters.keySet()) + + { + + centroids2.addAll(WeightAndClusters.get(weight)); + + } + +// Agglomerative3 aggloOffline = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids2, so.getk()); +// aggloOffline.setWeights(weights2); +// this.centroids = aggloOffline.getCentroids(); + + KMeans2 aggloOffline2 = new KMeans2(); + aggloOffline2.setK(so.getk()); + aggloOffline2.setRawData(centroids2); +// aggloOffline2.setWeights(weights2); + this.centroids = aggloOffline2.getCentroids(); + +// MultiKMPP aggloOffline3 = new MultiKMPP(centroids2,so.getk()); +// this.centroids = aggloOffline3.getCentroids(); + +//// DBScan algo = new DBScan(centroids2, (eps/(20)), 3); +//// System.out.println("epsssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss = "+ eps/(20)); +//// this.centroids = algo.getCentroids(); +//// System.out.println("no. of final output centroids = "+ centroids.size()); + + } + + public static void main(String[] args) throws FileNotFoundException, + IOException, InterruptedException { + + System.gc(); + + // int k ; //= 10; + // int d = 200;//16; + // int n = 10000; + // float var = 1.5f; + // int count = 1; + // System.out.printf("ClusterVar\t"); + // for (int i = 0; i < count; i++) + // System.out.printf("Trial%d\t", i); + // System.out.printf("RealWCSS\n"); + + // String Output = "/C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/run_results/3runs/rnaseq_k4/OutputTwrpCents_dbscan" ; + + // float f = var; + // float avgrealwcss = 0; + float avgtime = 0; + // System.out.printf("%f\t", f); + // GenerateData gen = new GenerateData(k, n/k, d, f, true, .5f); + + // gen.writeCSVToFile(new File("/C:/Users/deysn/Desktop/temp/run_results/3runs/rough/1D.txt")); + // List data = "/C:/Users/user/Desktop/temp/OutputTwrpCents1" + + // RPHashObject o = new SimpleArrayReader(gen.data, k); + + boolean raw = Boolean.parseBoolean(("raw")); + List data = null; + // 
"/C:/Users/deysn/Desktop/temp/har/1D.txt" ; C:/Users/deysn/Documents/temp/covtype/1D.txt + // C:/Users/dey.sn/Downloads/temp/covtype/1D.csv ; "C:/Users/dey.sn/Downloads/temp/run_results/3runs/har_k6/1D.txt" + String inputfile = "C:/Users/dey.sn/Downloads/temp/crop_mapping/1D.csv" ; + System.out.println(inputfile); + data = VectorUtil.readFile( inputfile , raw); + for (int k=4; k<=11;k++) + { + for (int i = 1; i <= 3; i++) + { + //k = 7; + + RPHashObject o = new SimpleArrayReader(data, k); + + o.setDimparameter(16); + o.setCutoff(130); //230 + o.setRandomVector(true); + +// System.out.println("cutoff = "+ o.getCutoff()); +// System.out.println("get_random_Vector = "+ o.getRandomVector()); + + TWRPv6_wcss_offline2_TEST rphit = new TWRPv6_wcss_offline2_TEST(o); + + System.gc(); + + Runtime rt = Runtime.getRuntime(); + rt.gc(); + Thread.sleep(10); + rt.gc(); + long startmemory = rt.totalMemory() - rt.freeMemory(); + long startTime = System.nanoTime(); + + List centsr = rphit.getCentroids(); + + avgtime += (System.nanoTime() - startTime) / 1000000000f ; + + float usedMB = ((rt.totalMemory() - rt.freeMemory()) - startmemory) / (1024*1024); + + System.out.println(" Time(in sec), " + avgtime + ", Mem_Used(MB):, " + (usedMB/3) ); + + rt.gc(); + Thread.sleep(10); + rt.gc(); + +// avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(),gen.getData()); +// String Output = "/C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/run_results/3runs/rnaseq_k4/OutputTwrpCents_dbscan" ; + String Output = "C:/Users/dey.sn/Downloads/work/output/cropmap_k7/cropmap_k7_kmeans_130_cutoff"+"_" +k+"_"+i+".csv" ; + VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); + +// VectorUtil.writeVectorFile(new File(Output+"_"+"labels"+".txt"), centsr.getLabels()); + + +// System.out.printf("WCSS for generated data = "+ "%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.out.printf(",WCSS for Winning Kmeans, = , "+ "%.0f ", 
StatTests.WCSSECentroidsFloat(centsr, data)); + System.out.println(",k, is: , "+k); +// + System.gc(); + } + } + +// System.out.printf("%.0f\n", avgrealwcss / count); + + + } + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + this.so.setK(getk); + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + return false; + } + + //@Override + public void setCutoff(int getCutoff) { + this.so.setCutoff(getCutoff); + } + + //@Override + public void setRandomVector(boolean getRandomVector) { + this.so.setRandomVector(getRandomVector); + } + + public void seteps(float eps) { + this.eps=eps; + } +} diff --git a/src/main/java/edu/uc/rphash/PPAHStream_v2.java b/src/main/java/edu/uc/rphash/PPAHStream_v2.java new file mode 100644 index 0000000..8082479 --- /dev/null +++ b/src/main/java/edu/uc/rphash/PPAHStream_v2.java @@ -0,0 +1,320 @@ +package edu.uc.rphash; + +/* + This class will run the Parameter-free Projected Adaptive Hash Stream Clustering + */ +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Random; +import java.util.Map.Entry; +import java.util.TreeSet; +import java.util.stream.Stream; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.generators.GenerateStreamData; + + + + +public class 
PPAHStream_v2 implements StreamClusterer { + + + private float[] rngvec; + private List centroids = null; + private RPHashObject so; + // #create projector matrixs + Projector projector ; + int ct=0; + int pdim = 20; + + public PPAHStream_v2(int k, GenerateStreamData gen, int i) { + so = new SimpleArrayReader(gen,k); + projector = so.getProjectionType(); + projector.setOrigDim(so.getdim()); + projector.setProjectedDim(pdim); + projector.setRandomSeed(so.getRandomSeed()); + projector.init(); + initTablesWith(); + } + + public List getCentroids(RPHashObject so) { + this.so = so; + return getCentroids(); + } + + + /* + * X - set of vectors compute the medoid of a vector set + */ + /** Add vector to running Centroid + * @param cnt_1,cnt_2 + * @param x_1 + */ + public static float[] update_cent(int ct, float[] x, float[] cent){ + for(int i=0;i 0) + s += 1; + addcent(s,x); + } + return s; + } + + + /* + * ===========================MinCount Sketch======================= + */ + public static final long PRIME_MODULUS = (1L << 31) - 1; + private int depth; + private int width; + private int[][] tableS; + private float[][][] tableCent; + private long[] hashA; + + + private void initTablesWith() { + this.width = (int) Math.ceil(2 / .025); + this.depth = (int) Math.ceil(-Math.log(1 - .97) / Math.log(2)); + this.tableS = new int[depth][width]; + this.tableCent = new float[depth][width][];//we will fill these in as we need them + this.hashA = new long[depth];//hash offsets + Random r = new Random(); + for (int i = 0; i < depth; ++i) { + hashA[i] = r.nextLong(); + } + } + + private int hash(long item, int i) { + long hash = hashA[i] * item; + hash += hash >>> 32; + hash &= PRIME_MODULUS; + return (int) (hash % width); + + } + + private int count(long lshhash) { + int min = (int) tableS[0][hash(lshhash, 0)]; + for (int i = 1; i < depth; ++i) { + if (tableS[i][hash(lshhash, i)] < min) + min = (int) tableS[i][hash(lshhash, i)]; + } + return min; + } + + private float[] 
get_cent_sketch(long lshhash) { + int min = (int) tableS[0][hash(lshhash, 0)]; + int mini = 0; + int minhtmp = 0; + for (int i = 1; i < depth; ++i) { + int htmp = hash(lshhash, i); + if (tableS[i][hash(lshhash, i)] < min){ + mini = i; + minhtmp = htmp; + min = (int) tableS[i][htmp]; + } + } + + return tableCent[mini][minhtmp]; + } + + private void addcent(long lshhash, float[] x){ + + int htmp = hash(lshhash, 0); + int argmini = 0; + int argminhtmp = htmp; + + tableS[0][htmp] += 1; + int min = (int) tableS[0][htmp]; + + for (int i = 1; i < depth; ++i) { + htmp = hash(lshhash, i); + tableS[i][htmp] += 1; + + if (tableS[i][htmp] < min){ + min = (int) tableS[i][htmp]; + argmini = i; + argminhtmp = htmp; + } + } + + if(tableCent[argmini][argminhtmp]==null){ + tableCent[argmini][argminhtmp] = x; + } + else{ + update_cent(min, x, tableCent[argmini][argminhtmp]); + } + } + /* + * ===========================MinCount Sketch======================= + */ + + + + /* + * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid + * vector map + * + * hash the projected vector x and update the hash to centroid and counts + * maps + */ + void addtocounter(float[] x, Projector p) { + float[] xt = p.project(x); + hashvec(xt, x); + } + + @Override + public long addVectorOnlineStep(float[] x) { + addtocounter(x, projector); + return 0; + } + + @Override + public List getCentroidsOfflineStep() { + + // next we want to prune the tree by parent count comparison + // follows breadthfirst search + HashMap densityAndID = new HashMap(); + for (Long cur_id =0l;cur_id<2<>> 1; + long parent_count = count(parent_id); + + if (2 * cur_count > parent_count) { + densityAndID.put(parent_id, 0l); + densityAndID.put(cur_id,cur_count); + } + } + + //remove keys with support less than 2 + Stream> stream = densityAndID.entrySet().stream().filter(p -> p.getValue() > 1); + //64 so 6 bits? 
+ //stream = stream.filter(p -> p.getKey() > 64); + + List sortedIDList= new ArrayList<>(); + // sort and limit the list + stream.sorted(Entry. comparingByValue().reversed()).limit(so.getk()*1) + .forEachOrdered(x -> sortedIDList.add(x.getKey())); + + // compute centroids + List estcents = new ArrayList<>(); + for (int i = 0; i < sortedIDList.size(); i++) { + System.out.println(densityAndID.get(sortedIDList.get(i))); + if(get_cent_sketch(sortedIDList.get(i))!=null) + estcents.add(new Centroid( get_cent_sketch(sortedIDList.get(i)))); + } + + return estcents; + } + + @Override + public void shutdown() { + } + + @Override + public int getProcessors() { + return 0; + } + + @Override + public List getCentroids() { + return null; + } + + + public static void main(String[] args) throws Exception { + + int k = 10; + int d = 100; + int interval = 1000; + float var = 1f; + + Runtime rt = Runtime.getRuntime(); + GenerateStreamData gen = new GenerateStreamData(k, d, var, 1133131); + + StreamClusterer rphit = new PPAHStream_v2(k, gen, 1); + //StreamClusterer rphit = new RPHashStreaming(k, gen, 1); + + ArrayList vecsInThisRound = new ArrayList(); + + System.out.printf("Vecs\tMem(KB)\tTime\tWCSSE\n"); + long timestart = System.nanoTime(); + for (int i = 0; i < interval * 6; i++) { + vecsInThisRound.add(gen.generateNext()); + if (i % interval == interval - 1) { + timestart = System.nanoTime(); + for (float[] f : vecsInThisRound) { + rphit.addVectorOnlineStep(f); + } + + List cents = rphit.getCentroidsOfflineStep(); + long time = System.nanoTime() - timestart; + rt.gc(); + long usedkB = (rt.totalMemory() - rt.freeMemory()) / 1024; + double wcsse = StatTests.WCSSECentroidsFloat(cents, vecsInThisRound); + vecsInThisRound = new ArrayList(); + System.out.printf("%d\t%d\t%.4f\t%.4f\n", i, usedkB, + time / 1000000000f, wcsse); + } + } + } + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method 
stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + this.so.setK(getk); + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + // TODO Auto-generated method stub + return false; + } + +} diff --git a/src/main/java/edu/uc/rphash/PRPHashStream.java b/src/main/java/edu/uc/rphash/PRPHashStream.java index 462c99d..a906431 100644 --- a/src/main/java/edu/uc/rphash/PRPHashStream.java +++ b/src/main/java/edu/uc/rphash/PRPHashStream.java @@ -1,5 +1,7 @@ package edu.uc.rphash; - +/* +This class will run the Parameter-free Random Projection Hash Stream Clustering +*/ import java.util.ArrayList; import java.util.List; import java.util.Random; diff --git a/src/main/java/edu/uc/rphash/Readers/SimpleArrayReader.java b/src/main/java/edu/uc/rphash/Readers/SimpleArrayReader.java index 8fe1916..322ab22 100644 --- a/src/main/java/edu/uc/rphash/Readers/SimpleArrayReader.java +++ b/src/main/java/edu/uc/rphash/Readers/SimpleArrayReader.java @@ -99,6 +99,37 @@ public SimpleArrayReader(List X, int k) { // topIDs.add((long) 0); } + public SimpleArrayReader(List X) { + + this.randomSeed = new Random().nextLong(); + this.hashmod = DEFAULT_HASH_MODULUS; + this.decoderMultiplier = DEFAULT_NUM_DECODER_MULTIPLIER; + if(this.decoderMultiplier>1) + this.dec = new MultiDecoder(this.decoderMultiplier*DEFAULT_INNER_DECODER.getDimensionality(),DEFAULT_INNER_DECODER); + else + this.dec = DEFAULT_INNER_DECODER; + this.numProjections = DEFAULT_NUM_PROJECTIONS; + this.numBlur = DEFAULT_NUM_BLUR; + this.data = X; + if(data!=null) + this.dim = data.get(0).length; + else + this.dim = 
null; + // this.k = k; + this.centroids = new ArrayList(); + this.topIDs = new ArrayList(); + this.decayrate = 0; + this.dimparameter = DEFAULT_DIM_PARAMETER; + this.clusterer = DEFAULT_OFFLINE_CLUSTERER; + this.projector = DEFAULT_PROJECTOR; +// for (int i = 0; i < k; i++) +// topIDs.add((long) 0); + } + + + + + // public SimpleArrayReader(List X, int k, int blur) { // // this.randomSeed = DEFAULT_NUM_RANDOM_SEED; diff --git a/src/main/java/edu/uc/rphash/TWRPv4.java b/src/main/java/edu/uc/rphash/TWRPv4.java index df32d4d..06bb22d 100644 --- a/src/main/java/edu/uc/rphash/TWRPv4.java +++ b/src/main/java/edu/uc/rphash/TWRPv4.java @@ -1,5 +1,7 @@ package edu.uc.rphash; + +/* This class uses the bisection vector as the lsh partition vector */ import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; diff --git a/src/main/java/edu/uc/rphash/TWRPv6_WCSS2.java b/src/main/java/edu/uc/rphash/TWRPv6_WCSS2.java index 0a5ab56..53c2de4 100644 --- a/src/main/java/edu/uc/rphash/TWRPv6_WCSS2.java +++ b/src/main/java/edu/uc/rphash/TWRPv6_WCSS2.java @@ -558,9 +558,9 @@ public static void main(String[] args) throws FileNotFoundException, IOException { int k = 10;//6; - int d = 200;//16; - int n = 10000; - float var = 1.5f; + int d = 100;//16; + int n = 1000; + float var = 1f; int count = 1; // System.out.printf("ClusterVar\t"); // for (int i = 0; i < count; i++) @@ -573,7 +573,7 @@ public static void main(String[] args) throws FileNotFoundException, float avgrealwcss = 0; float avgtime = 0; // System.out.printf("%f\t", f); - GenerateData gen = new GenerateData(k, n/k, d, f, true, .5f); + GenerateData gen = new GenerateData(k, n/k, d, f, true, 1f); // gen.writeCSVToFile(new File("/home/lee/Desktop/reclsh/in.csv")); @@ -598,12 +598,12 @@ public static void main(String[] args) throws FileNotFoundException, avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(), gen.getData()); - VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); +// 
VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); System.gc(); - System.out.printf("%.0f\n", avgrealwcss / count); +// System.out.printf("%.0f\n", avgrealwcss / count); } diff --git a/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_10runs.java b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_10runs.java index 75eed62..d455a80 100644 --- a/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_10runs.java +++ b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_10runs.java @@ -32,7 +32,7 @@ -// this algorithm runs twrp 5 times : (only the random bisection vector varies, the Projection matrix remains same) +// this algorithm runs twrp 10 times : (only the random bisection vector varies, the Projection matrix remains same) // and selects the one which has the best wcss offline for the 10X candidate centroids. public class TWRPv6_wcss_offline2_TEST2_10runs implements Clusterer, Runnable { diff --git a/src/main/java/edu/uc/rphash/kneefinder/JythonTest.java b/src/main/java/edu/uc/rphash/kneefinder/JythonTest.java index ef0040b..17df750 100644 --- a/src/main/java/edu/uc/rphash/kneefinder/JythonTest.java +++ b/src/main/java/edu/uc/rphash/kneefinder/JythonTest.java @@ -9,6 +9,8 @@ import java.util.ArrayList; import java.util.Random; import java.util.Arrays; +import java.util.HashMap; +import java.util.List; // to find the knee, modified from " https://github.com/lukehb/137-stopmove/blob/master/src/main/java/onethreeseven/stopmove/algorithm/Kneedle.java by Luke Bermingham " @@ -73,11 +75,11 @@ private double[][] prepare(double[][] data, int smoothingWindow){ //smooth the data to make local minimum/maximum easier to find (this is Step 1 in the paper) double[][] smoothedData = Maths.gaussianSmooth2d(data, smoothingWindow); double[][] smoothedData2 = Maths.Smooth2d(data); - System.out.println("this is the smoothed out data using gaussian kernal -------------------"); 
- System.out.println(Arrays.deepToString(smoothedData)); - System.out.println(data.length); + // System.out.println("this is the smoothed out data using gaussian kernal -------------------"); + // System.out.println(Arrays.deepToString(smoothedData)); + // System.out.println(data.length); - System.out.println(";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;"); + // System.out.println(";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;"); System.out.println("this is the smoothed out data using linear interpolation -------------------"); System.out.println(Arrays.deepToString(smoothedData2)); @@ -187,6 +189,62 @@ public ArrayList run(double[][] data, double s, int smoothingWindow, b } +// method to call to find elbow + + public int find_elbow( List counts ){ + + int first_elbow; + int size_of_list = counts.size(); + int cutoff = 0; + // System.out.print("\n" + " size_of_list : " + size_of_list); + + if(size_of_list >= 100){ + cutoff = 100; + } + if(size_of_list < 100){ + cutoff = size_of_list ; + } + + System.out.print("\n" + " cutoff : " + cutoff + "\n"); + + List counts1 = counts; + + double[][] elbowdata = new double[cutoff][2] ; + + for (int i= 0;i<(cutoff-1);i++) { + + elbowdata[i][1]= (cutoff-1)-i;} // index + + for (int i= 0;i run(double[][] data, double s, int smoothingWindow, boolean findElbows) + List list_of_elbows= new ArrayList<>(); + +ArrayList elbows = run ( elbowdata, 1 , 1, false); + +System.out.print("\n" + "number of elbow points : " + elbows.size()); +for (double[] point : elbows) { +//System.out.print("\n" +"Knee point:" + Arrays.toString(point)); +//System.out.println("\n" +"No. of clusters complement = " + point[1] ); +//System.out.println("\n" + "No. 
of clusters = " + (elbowdata.length - point[1])); + +list_of_elbows.add(elbowdata.length - point[1]); + } + + +first_elbow = (int) list_of_elbows.get(0).intValue(); + +return first_elbow ; + + } + + + + // to test the funtion : public static void main(String[] args){ @@ -261,7 +319,7 @@ public static void main(String[] args){ 9, 9, 9, - 9, + 8, } ; double elbow_point = elbowcalculator.findElbowQuick(elbowdata2); diff --git a/src/main/java/edu/uc/rphash/tests/test_elbow.java b/src/main/java/edu/uc/rphash/tests/test_elbow.java new file mode 100644 index 0000000..2b994fa --- /dev/null +++ b/src/main/java/edu/uc/rphash/tests/test_elbow.java @@ -0,0 +1,89 @@ +package edu.uc.rphash.tests; + +import java.util.ArrayList; +//import java.util.Arrays; +import java.util.List; + +import edu.uc.rphash.kneefinder.JythonTest; + + +public class test_elbow { + + + public static void main(String[] args){ + + + List counts = new ArrayList<>(50); + + double elbowdata[] = {5000, + 4000, + 3000, + 2000, + 1000, + 900, + 800, + 700, + 600, + 500, + 450, + 400, + 350, + 300, + 250, + 225, + 200, + 175, + 150, + 125, + 100, + 75, + 50, + 25, + 24, + 23, + 22, + 21, + 20, + 19, + 18, + 17, + 16, + 15, + 14, + 13, + 12, + 11, + 10, + 10, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 8, + } ; + + int size = elbowdata.length ; + + for (int i= 0;i findDensityModes2() { for (float[] x : so.getRawData()) { addtocounter(x, projector, MapOfIDAndCent1, MapOfIDAndCount1,ct++, rngvec, MapOfIDAndWCSS1); - addtocounter(x, projector, MapOfIDAndCent2, MapOfIDAndCount2,ct++, rngvec2,MapOfIDAndWCSS2); - addtocounter(x, projector, MapOfIDAndCent3, MapOfIDAndCount3,ct++, rngvec3,MapOfIDAndWCSS3); + addtocounter(x, projector, MapOfIDAndCent2, MapOfIDAndCount2,ct2++, rngvec2,MapOfIDAndWCSS2); + addtocounter(x, projector, MapOfIDAndCent3, MapOfIDAndCount3,ct3++, rngvec3,MapOfIDAndWCSS3); } } - System.out.println("\nNumberOfVertors = , "+ ct); + System.out.println("\nNumberOfVectors = , "+ ct); 
System.out.println("\nNumberOfMicroClustersBeforePruning = , "+ MapOfIDAndCent1.size()); //printHashmap(MapOfIDAndCount1); @@ -331,7 +343,8 @@ public Multimap findDensityModes2() { HashMap denseSetOfIDandCount2_1 = new HashMap(); for (Long cur_id : new TreeSet(MapOfIDAndCount1.keySet())) { - if (cur_id >so.getk()){ + //if (cur_id >so.getk()){ + if (cur_id > Long.valueOf(3)){ int cur_count = (int) (MapOfIDAndCount1.get(cur_id).longValue()); long parent_id = cur_id>>>1; int parent_count = (int) (MapOfIDAndCount1.get(parent_id).longValue()); @@ -369,7 +382,8 @@ public Multimap findDensityModes2() { for (Long cur_id : new TreeSet(MapOfIDAndCount2.keySet())) { - if (cur_id >so.getk()){ + //if (cur_id >so.getk()){ + if (cur_id > Long.valueOf(7)){ int cur_count = (int) (MapOfIDAndCount2.get(cur_id).longValue()); long parent_id = cur_id>>>1; int parent_count = (int) (MapOfIDAndCount2.get(parent_id).longValue()); @@ -407,7 +421,8 @@ public Multimap findDensityModes2() { HashMap denseSetOfIDandCount2_3 = new HashMap(); for (Long cur_id : new TreeSet(MapOfIDAndCount3.keySet())) { - if (cur_id >so.getk()){ + //if (cur_id >so.getk()){ + if (cur_id > Long.valueOf(11)){ int cur_count = (int) (MapOfIDAndCount3.get(cur_id).longValue()); long parent_id = cur_id>>>1; int parent_count = (int) (MapOfIDAndCount3.get(parent_id).longValue()); @@ -443,16 +458,15 @@ public Multimap findDensityModes2() { //remove keys with support less than 1 - Stream> stream2_1 = denseSetOfIDandCount2_1.entrySet().stream().filter(p -> p.getValue() > 1); - + Stream> stream2_1 = denseSetOfIDandCount2_1.entrySet().stream().filter(p -> p.getValue() > 1); List sortedIDList2_1= new ArrayList<>(); // sort and limit the list stream2_1.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) .forEachOrdered(x -> sortedIDList2_1.add(x.getKey())); // printHashmap(denseSetOfIDandCount2_1); - Stream> stream2_2 = denseSetOfIDandCount2_2.entrySet().stream().filter(p -> p.getValue() > 1); + Stream> stream2_2 = denseSetOfIDandCount2_2.entrySet().stream().filter(p -> p.getValue() > 1); List sortedIDList2_2= new ArrayList<>(); // sort and limit the list stream2_2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) @@ -461,7 +475,6 @@ public Multimap findDensityModes2() { Stream> stream2_3 = denseSetOfIDandCount2_3.entrySet().stream().filter(p -> p.getValue() > 1); - List sortedIDList2_3= new ArrayList<>(); // sort and limit the list stream2_3.sorted(Entry. comparingByValue().reversed()).limit(cutoff) @@ -543,7 +556,9 @@ else if ((WCSS2<= WCSS1) && (WCSS2<=WCSS3)) //printHashmap(denseSetOfIDandCount2); float eps= printInfo(sortedIDList2,denseSetOfIDandCount2, MapOfIDAndCent,MapOfIDAndWCSS); // seteps(eps); - + + + Multimap multimapWeightAndCent = ArrayListMultimap.create(); for (Long keys: sortedIDList2) @@ -671,25 +686,28 @@ public static void main(String[] args) throws FileNotFoundException, List data = null; // "/C:/Users/deysn/Desktop/temp/har/1D.txt" ; C:/Users/deysn/Documents/temp/covtype/1D.txt // C:/Users/dey.sn/Downloads/temp/covtype/1D.csv ; "C:/Users/dey.sn/Downloads/temp/run_results/3runs/har_k6/1D.txt" - String inputfile = "C:/Users/dey.sn/Downloads/temp/crop_mapping/1D.csv" ; + //String inputfile = "C:/Users/dey.sn/Downloads/temp/crop_mapping/1D.csv" ; + String inputfile = "C:/Users/sayan/OneDrive - University of Cincinnati/Documents/downloaded/sensorless_drive/1D.csv" ; System.out.println(inputfile); data = VectorUtil.readFile( inputfile , raw); - for (int k=4; k<=11;k++) + for (int k=10; k<=10 ;k++) { - for (int i = 1; i <= 3; i++) + for (int i = 1; i <= 5; i++) { //k = 7; RPHashObject o = new SimpleArrayReader(data, k); o.setDimparameter(16); - o.setCutoff(130); //230 + o.setCutoff(250); //230 
o.setRandomVector(true); // System.out.println("cutoff = "+ o.getCutoff()); // System.out.println("get_random_Vector = "+ o.getRandomVector()); - TWRPv6_wcss_offline2_TEST rphit = new TWRPv6_wcss_offline2_TEST(o); +// TWRPv6_wcss_offline2_TEST rphit = new TWRPv6_wcss_offline2_TEST(o); + PPAHStream rphit = new PPAHStream(o); + System.gc(); @@ -714,7 +732,8 @@ public static void main(String[] args) throws FileNotFoundException, // avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(),gen.getData()); // String Output = "/C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/run_results/3runs/rnaseq_k4/OutputTwrpCents_dbscan" ; - String Output = "C:/Users/dey.sn/Downloads/work/output/cropmap_k7/cropmap_k7_kmeans_130_cutoff"+"_" +k+"_"+i+".csv" ; + //String Output = "C:/Users/dey.sn/Downloads/work/output/cropmap_k7/cropmap_k7_kmeans_130_cutoff"+"_" +k+"_"+i+".csv" ; + String Output = "/C:/Users/sayan/OneDrive - University of Cincinnati/Documents/downloaded/results/har_6clus/har_k6_kmeans_130_cutoff"+"_" +k+"_"+i+".csv" ; VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); // VectorUtil.writeVectorFile(new File(Output+"_"+"labels"+".txt"), centsr.getLabels()); diff --git a/src/main/java/edu/uc/rphash/Readers/RPHashObject.java b/src/main/java/edu/uc/rphash/Readers/RPHashObject.java index 2d5b0a5..6e7f16d 100644 --- a/src/main/java/edu/uc/rphash/Readers/RPHashObject.java +++ b/src/main/java/edu/uc/rphash/Readers/RPHashObject.java @@ -44,6 +44,8 @@ public interface RPHashObject { final static Projector DEFAULT_PROJECTOR = new DBFriendlyProjection(); //final static Projector DEFAULT_PROJECTOR = new GaussianProjection(); + + int getdim(); diff --git a/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_10runs.java b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_10runs.java index d455a80..4f9532b 100644 --- a/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_10runs.java +++ 
b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_10runs.java @@ -17,6 +17,7 @@ import edu.uc.rphash.Readers.RPHashObject; import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.kneefinder.JythonTest; import edu.uc.rphash.projections.Projector; import edu.uc.rphash.tests.StatTests; import edu.uc.rphash.tests.clusterers.Agglomerative3; @@ -37,6 +38,9 @@ public class TWRPv6_wcss_offline2_TEST2_10runs implements Clusterer, Runnable { boolean znorm = false; + int [] num_of_clusters_stage1 = new int[12]; + int min_k; + int max_k; // convert this to an array of arrays private int counter; @@ -50,6 +54,8 @@ public class TWRPv6_wcss_offline2_TEST2_10runs implements Clusterer, Runnable { private float[] rngvec8; private float[] rngvec9; private float[] rngvec10; + private float[] rngvec11; + private float[] rngvec12; private List centroids = null; @@ -86,7 +92,7 @@ public static float distancesq(float[] x, float[] y) { } -// This method finds the largest of the numbers and returns that index. +// This method finds the smallest of the numbers and returns that index. 
public static int smallest(float[] arr) { @@ -150,21 +156,7 @@ public static float[][] UpdateHashMap(float cnt_1, float[] x_1, float wcss_1, } -// this method is used to calculate the offline wcss -// UpdateHashMap_offlineWcss( CurrentCent, currentWcss, IncomingVector, incomingWcss ); - - public static float[][] UpdateHashMap_offlineWcss(float[] x_1, float wcss_1,float[] x_2 ) { - - float wcss = wcss_1 + distancesq(x_1,x_2); - - - float[][] ret = new float[3][]; - ret[0] = new float[1]; - ret[2]= new float [1]; - ret[2][0]= wcss; - return ret; - - } + public long hashvec2( float[] xt, float[] x, HashMap MapOfIDAndCent, HashMap MapOfIDAndCount, int ct, float[] rngvec, HashMap MapOfIDAndWCSS) { @@ -211,39 +203,7 @@ public long hashvec2( float[] xt, float[] x, return s; } -// this hash is to calculate the wcss -// hashvec2_forwcss(xt,x,IDAndCent,rngvec ,IDandWCSS); - - public long hashvec2_forwcss( float[] xt, float[] x, HashMap MapOfIDAndCent, float[] rngvec, HashMapIDandWCSS_offline) { - long s = 1; //fixes leading 0's bug - for (int i = 0; i < xt.length; i++) { - s = s << 1 ; // left shift the bits of s by 1. 
- if (xt[i] > rngvec[i]) - s= s+1; - - if (MapOfIDAndCent.containsKey(s)) { - - float CurrentCent [] = MapOfIDAndCent.get(s); - float IncomingVector [] = x; - - - float currentWcss= 0; - - if (IDandWCSS_offline.containsKey(s)) { - currentWcss= IDandWCSS_offline.get(s); - } - - float[][] MergedValues = UpdateHashMap_offlineWcss( CurrentCent, currentWcss, IncomingVector ); - - float wcss= MergedValues[2][0]; - - IDandWCSS_offline.put(s, wcss); - - } - } - return s; - } - + /* * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid * vector map @@ -258,20 +218,6 @@ void addtocounter(float[] x, Projector p, hashvec2(xt,x,IDAndCent, IDandID, ct,rngvec , IDandWCSS); } - // this method is used to compute the offline WCSS to choose the best of the clusters - //calcWCSSoffline(x, projector, MapOfIDAndCent1, rngvec, MapOfIDAandWCSS1_offline); - - void calcWCSSoffline(float[] x, Projector p, HashMap MapOfIDAndCent, float[] rngvec , HashMap MapOfIDAandWCSS_offline) { - - float[] xt = p.project(x); - - hashvec2_forwcss(xt,x,MapOfIDAndCent,rngvec ,MapOfIDAandWCSS_offline); - - } - - static boolean isPowerOfTwo(long num) { - return (num & -num) == num; - } /* * X - data set k - canonical k in k-means l - clustering sub-space Compute @@ -320,6 +266,13 @@ public Multimap findDensityModes2() { HashMap MapOfIDAndCount10 = new HashMap<>(); HashMap MapOfIDAndWCSS10 = new HashMap<>(); + HashMap MapOfIDAndCent11 = new HashMap<>(); + HashMap MapOfIDAndCount11 = new HashMap<>(); + HashMap MapOfIDAndWCSS11 = new HashMap<>(); + + HashMap MapOfIDAndCent12 = new HashMap<>(); + HashMap MapOfIDAndCount12 = new HashMap<>(); + HashMap MapOfIDAndWCSS12 = new HashMap<>(); // #create projector matrixs @@ -328,8 +281,24 @@ public Multimap findDensityModes2() { projector.setProjectedDim(so.getDimparameter()); projector.setRandomSeed(so.getRandomSeed()); // projector.setRandomSeed(535247432); - projector.init(); + + // #create projector matrixs + Projector projector2 = 
so.getProjectionType(); + projector2.setOrigDim(so.getdim()); + projector2.setProjectedDim(so.getDimparameter()); + projector2.setRandomSeed(so.getRandomSeed()); +// projector.setRandomSeed(535247432); + projector2.init(); + + // #create projector matrixs + Projector projector3 = so.getProjectionType(); + projector3.setOrigDim(so.getdim()); + projector3.setProjectedDim(so.getDimparameter()); + projector3.setRandomSeed(so.getRandomSeed()); +// projector.setRandomSeed(535247432); + projector3.init(); + int cutoff = so.getCutoff(); int ct = 0; @@ -342,13 +311,17 @@ public Multimap findDensityModes2() { addtocounter(x, projector, MapOfIDAndCent2, MapOfIDAndCount2,ct++, rngvec2,MapOfIDAndWCSS2); addtocounter(x, projector, MapOfIDAndCent3, MapOfIDAndCount3,ct++, rngvec3,MapOfIDAndWCSS3); addtocounter(x, projector, MapOfIDAndCent4, MapOfIDAndCount4,ct++, rngvec4,MapOfIDAndWCSS4); - addtocounter(x, projector, MapOfIDAndCent5, MapOfIDAndCount5,ct++, rngvec5,MapOfIDAndWCSS5); - addtocounter(x, projector, MapOfIDAndCent6, MapOfIDAndCount6,ct++, rngvec6, MapOfIDAndWCSS6); - addtocounter(x, projector, MapOfIDAndCent7, MapOfIDAndCount7,ct++, rngvec7,MapOfIDAndWCSS7); - addtocounter(x, projector, MapOfIDAndCent8, MapOfIDAndCount8,ct++, rngvec8,MapOfIDAndWCSS8); - addtocounter(x, projector, MapOfIDAndCent9, MapOfIDAndCount9,ct++, rngvec9,MapOfIDAndWCSS9); - addtocounter(x, projector, MapOfIDAndCent10, MapOfIDAndCount10,ct++, rngvec10,MapOfIDAndWCSS10); + addtocounter(x, projector2, MapOfIDAndCent5, MapOfIDAndCount5,ct++, rngvec5,MapOfIDAndWCSS5); + addtocounter(x, projector2, MapOfIDAndCent6, MapOfIDAndCount6,ct++, rngvec6, MapOfIDAndWCSS6); + addtocounter(x, projector2, MapOfIDAndCent7, MapOfIDAndCount7,ct++, rngvec7,MapOfIDAndWCSS7); + addtocounter(x, projector2, MapOfIDAndCent8, MapOfIDAndCount8,ct++, rngvec8,MapOfIDAndWCSS8); + + addtocounter(x, projector3, MapOfIDAndCent9, MapOfIDAndCount9,ct++, rngvec9,MapOfIDAndWCSS9); + addtocounter(x, projector3, MapOfIDAndCent10, 
MapOfIDAndCount10,ct++, rngvec10,MapOfIDAndWCSS10); + addtocounter(x, projector3, MapOfIDAndCent11, MapOfIDAndCount11,ct++, rngvec11,MapOfIDAndWCSS11); + addtocounter(x, projector3, MapOfIDAndCent12, MapOfIDAndCount12,ct++, rngvec12,MapOfIDAndWCSS12); + } } @@ -687,7 +660,69 @@ public Multimap findDensityModes2() { } + HashMap denseSetOfIDandCount2_11 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount11.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount11.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount11.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_11.put(parent_id, 0L); + + MapOfIDAndCent11.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_11.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_11.remove(parent_id); + + MapOfIDAndCent11.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_11.put(cur_id, (long) cur_count); + } + } + } + } + } + HashMap denseSetOfIDandCount2_12 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount12.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount12.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount12.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_12.put(parent_id, 0L); + + MapOfIDAndCent12.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_12.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_12.remove(parent_id); + + MapOfIDAndCent12.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_12.put(cur_id, (long) cur_count); + } + } + } + } + } //remove keys with support less than 1 Stream> stream2_1 = 
denseSetOfIDandCount2_1.entrySet().stream().filter(p -> p.getValue() > 1); @@ -697,7 +732,6 @@ public Multimap findDensityModes2() { .forEachOrdered(x -> sortedIDList2_1.add(x.getKey())); - Stream> stream2_2 = denseSetOfIDandCount2_2.entrySet().stream().filter(p -> p.getValue() > 1); List sortedIDList2_2= new ArrayList<>(); // sort and limit the list @@ -725,7 +759,6 @@ public Multimap findDensityModes2() { .forEachOrdered(x -> sortedIDList2_5.add(x.getKey())); - Stream> stream2_6 = denseSetOfIDandCount2_6.entrySet().stream().filter(p -> p.getValue() > 1); List sortedIDList2_6= new ArrayList<>(); // sort and limit the list @@ -733,7 +766,6 @@ public Multimap findDensityModes2() { .forEachOrdered(x -> sortedIDList2_6.add(x.getKey())); - Stream> stream2_7 = denseSetOfIDandCount2_7.entrySet().stream().filter(p -> p.getValue() > 1); List sortedIDList2_7= new ArrayList<>(); // sort and limit the list @@ -741,7 +773,6 @@ public Multimap findDensityModes2() { .forEachOrdered(x -> sortedIDList2_7.add(x.getKey())); - Stream> stream2_8 = denseSetOfIDandCount2_8.entrySet().stream().filter(p -> p.getValue() > 1); List sortedIDList2_8= new ArrayList<>(); // sort and limit the list @@ -749,7 +780,6 @@ public Multimap findDensityModes2() { .forEachOrdered(x -> sortedIDList2_8.add(x.getKey())); - Stream> stream2_9 = denseSetOfIDandCount2_9.entrySet().stream().filter(p -> p.getValue() > 1); List sortedIDList2_9= new ArrayList<>(); // sort and limit the list @@ -757,75 +787,82 @@ public Multimap findDensityModes2() { .forEachOrdered(x -> sortedIDList2_9.add(x.getKey())); - Stream> stream2_10 = denseSetOfIDandCount2_10.entrySet().stream().filter(p -> p.getValue() > 1); List sortedIDList2_10= new ArrayList<>(); // sort and limit the list stream2_10.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) .forEachOrdered(x -> sortedIDList2_10.add(x.getKey())); - - - - float WCSS1 = 0; - float WCSS2 = 0; - float WCSS3 = 0; - float WCSS4 = 0; - float WCSS5 = 0; - float WCSS6 = 0; - float WCSS7 = 0; - float WCSS8 = 0; - float WCSS9 = 0; + + Stream> stream2_11 = denseSetOfIDandCount2_11.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_11= new ArrayList<>(); + // sort and limit the list + stream2_11.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_11.add(x.getKey())); + + Stream> stream2_12 = denseSetOfIDandCount2_12.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_12= new ArrayList<>(); + // sort and limit the list + stream2_12.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_12.add(x.getKey())); + +// finding elbows + JythonTest elbowcalculator = new JythonTest(); + double sum_jt = 0; + + num_of_clusters_stage1[0]= elbowcalculator.find_elbow(sortedIDList2_1); + num_of_clusters_stage1[1]= elbowcalculator.find_elbow(sortedIDList2_2); + num_of_clusters_stage1[2]= elbowcalculator.find_elbow(sortedIDList2_3); + num_of_clusters_stage1[3]= elbowcalculator.find_elbow(sortedIDList2_4); + num_of_clusters_stage1[4]= elbowcalculator.find_elbow(sortedIDList2_5); + num_of_clusters_stage1[5]= elbowcalculator.find_elbow(sortedIDList2_6); + num_of_clusters_stage1[6]= elbowcalculator.find_elbow(sortedIDList2_7); + num_of_clusters_stage1[7]= elbowcalculator.find_elbow(sortedIDList2_8); + num_of_clusters_stage1[8]= elbowcalculator.find_elbow(sortedIDList2_9); + num_of_clusters_stage1[9]= elbowcalculator.find_elbow(sortedIDList2_10); + num_of_clusters_stage1[10]= elbowcalculator.find_elbow(sortedIDList2_11); + num_of_clusters_stage1[11]= elbowcalculator.find_elbow(sortedIDList2_12); + + for (int i=0 ; i<12; i++) { + + //int num_of_clusters_2= elbowcalculator.find_elbow(counts); + System.out.println("\n" + 
"No. of clusters_stage1 = " + num_of_clusters_stage1[i]); + //System.out.println( "No. of clusters_by_COUNT = " + num_of_clusters_2); + System.out.println( "************************************************************" ); + sum_jt = sum_jt + num_of_clusters_stage1[i]; + } + System.out.println("\n" + "sum of No. of clusters_stage1 = " + sum_jt); + + double avg_clus_stage1 = Math.ceil(sum_jt / 12.0) ; + System.out.println("\n" + "Average of No. of clusters_stage1 = " + avg_clus_stage1 ); + System.out.println( "************************************************************" ); + // finding the range of K for offline clustering : + min_k = (int) (avg_clus_stage1 - 3) ; + + max_k = (int) (avg_clus_stage1 + 3) ; + + if (min_k < 3 ) { min_k = 3 ; max_k = 9; } ; + + + float WCSS1 = 0; + float WCSS2 = 0; + float WCSS3 = 0; + float WCSS4 = 0; + float WCSS5 = 0; + float WCSS6 = 0; + float WCSS7 = 0; + float WCSS8 = 0; + float WCSS9 = 0; float WCSS10 = 0; + float WCSS11 = 0; + float WCSS12 = 0; - float WCSS_off_1 = 0; - float WCSS_off_2 = 0; - float WCSS_off_3 = 0; - float WCSS_off_4 = 0; - float WCSS_off_5 = 0; - float WCSS_off_6 = 0; - float WCSS_off_7 = 0; - float WCSS_off_8 = 0; - float WCSS_off_9 = 0; - float WCSS_off_10 = 0; HashMap denseSetOfIDandCount2 = new HashMap(); HashMap MapOfIDAndCent = new HashMap<>(); HashMap MapOfIDAndCount = new HashMap<>(); HashMap MapOfIDAndWCSS = new HashMap<>(); - HashMap MapOfIDAandWCSS_offline_1 = new HashMap<>(); - HashMap MapOfIDAandWCSS_offline_2 = new HashMap<>(); - HashMap MapOfIDAandWCSS_offline_3 = new HashMap<>(); - HashMap MapOfIDAandWCSS_offline_4 = new HashMap<>(); - HashMap MapOfIDAandWCSS_offline_5 = new HashMap<>(); - HashMap MapOfIDAandWCSS_offline_6 = new HashMap<>(); - HashMap MapOfIDAandWCSS_offline_7 = new HashMap<>(); - HashMap MapOfIDAandWCSS_offline_8 = new HashMap<>(); - HashMap MapOfIDAandWCSS_offline_9 = new HashMap<>(); - HashMap MapOfIDAandWCSS_offline_10 = new HashMap<>(); - - - // calculate the real wcss in 
offline fashion, so for the keys , hash the points into those buckets - // and calculate the wcss as we know their centroids : - - - for (float[] x : so.getRawData()) - { - - calcWCSSoffline(x, projector, MapOfIDAndCent1, rngvec , MapOfIDAandWCSS_offline_1); - calcWCSSoffline(x, projector, MapOfIDAndCent2, rngvec2, MapOfIDAandWCSS_offline_2); - calcWCSSoffline(x, projector, MapOfIDAndCent3, rngvec3, MapOfIDAandWCSS_offline_3); - calcWCSSoffline(x, projector, MapOfIDAndCent4, rngvec4, MapOfIDAandWCSS_offline_4); - calcWCSSoffline(x, projector, MapOfIDAndCent5, rngvec5, MapOfIDAandWCSS_offline_5); - - calcWCSSoffline(x, projector, MapOfIDAndCent6, rngvec6 , MapOfIDAandWCSS_offline_6); - calcWCSSoffline(x, projector, MapOfIDAndCent7, rngvec7, MapOfIDAandWCSS_offline_7); - calcWCSSoffline(x, projector, MapOfIDAndCent8, rngvec8, MapOfIDAandWCSS_offline_8); - calcWCSSoffline(x, projector, MapOfIDAndCent9, rngvec9, MapOfIDAandWCSS_offline_9); - calcWCSSoffline(x, projector, MapOfIDAndCent10, rngvec10, MapOfIDAandWCSS_offline_10); - } - //* THIS IS THE RUNTIME CALCULATION OF WCSS STATISTICS WHICH IS DONE ONLINE: @@ -859,74 +896,41 @@ public Multimap findDensityModes2() { for (Long keys: sortedIDList2_10) { WCSS10 = WCSS10 + MapOfIDAndWCSS10.get(keys);} -//* THIS IS THE RUNTIME CALCULATION OF WCSS STATISTICS WHICH REQUIRES ANOTHER PASS OVER THE DATA: + for (Long keys: sortedIDList2_11) + { WCSS11 = WCSS11 + MapOfIDAndWCSS11.get(keys);} + + for (Long keys: sortedIDList2_12) + { WCSS12 = WCSS12 + MapOfIDAndWCSS12.get(keys);} - for (Long keys: sortedIDList2_1) - { WCSS_off_1 = WCSS_off_1 + MapOfIDAandWCSS_offline_1.get(keys);} - for (Long keys: sortedIDList2_2) - { WCSS_off_2 = WCSS_off_2 + MapOfIDAandWCSS_offline_2.get(keys);} - - for (Long keys: sortedIDList2_3) - { WCSS_off_3 = WCSS_off_3 + MapOfIDAandWCSS_offline_3.get(keys);} - - for (Long keys: sortedIDList2_4) - { WCSS_off_4 = WCSS_off_4 + MapOfIDAandWCSS_offline_4.get(keys);} - - for (Long keys: sortedIDList2_5) - { 
WCSS_off_5 = WCSS_off_5 + MapOfIDAandWCSS_offline_5.get(keys);} - - for (Long keys: sortedIDList2_6) - { WCSS_off_6 = WCSS_off_6 + MapOfIDAandWCSS_offline_6.get(keys);} - - for (Long keys: sortedIDList2_7) - { WCSS_off_7 = WCSS_off_7 + MapOfIDAandWCSS_offline_7.get(keys);} - - for (Long keys: sortedIDList2_8) - { WCSS_off_8 = WCSS_off_8 + MapOfIDAandWCSS_offline_8.get(keys);} - - for (Long keys: sortedIDList2_9) - { WCSS_off_9 = WCSS_off_9 + MapOfIDAandWCSS_offline_9.get(keys);} - - for (Long keys: sortedIDList2_10) - { WCSS_off_10 = WCSS_off_10 + MapOfIDAandWCSS_offline_10.get(keys);} - - - - System.out.print("wcss1 = " + WCSS1); - System.out.println(" wcss_ofline_1 = " + WCSS_off_1); + System.out.print(" wcss1 = " + WCSS1); - System.out.print("wcss2 = " + WCSS2); - System.out.println(" wcss_ofline_2 = " + WCSS_off_2); + System.out.print(" wcss2 = " + WCSS2); - System.out.print("wcss3 = " + WCSS3); - System.out.println(" wcss_ofline_3 = " + WCSS_off_3); + System.out.print(" wcss3 = " + WCSS3); - System.out.print("wcss4 = " + WCSS4); - System.out.println(" wcss_ofline_4 = " + WCSS_off_4); + System.out.print(" wcss4 = " + WCSS4); - System.out.print("wcss5 = " + WCSS5); - System.out.println(" wcss_ofline_5 = " + WCSS_off_5); + System.out.print(" wcss5 = " + WCSS5); - System.out.print("wcss6 = " + WCSS6); - System.out.println(" wcss_ofline_6 = " + WCSS_off_6); + System.out.print(" wcss6 = " + WCSS6); - System.out.print("wcss7 = " + WCSS7); - System.out.println(" wcss_ofline_7 = " + WCSS_off_7); + System.out.print(" wcss7 = " + WCSS7); - System.out.print("wcss8 = " + WCSS8); - System.out.println(" wcss_ofline_8 = " + WCSS_off_8); + System.out.print(" wcss8 = " + WCSS8); - System.out.print("wcss9 = " + WCSS9); - System.out.println(" wcss_ofline_9 = " + WCSS_off_9); + System.out.print(" wcss9 = " + WCSS9); - System.out.print("wcss10 = " + WCSS10); - System.out.println(" wcss_ofline_10 = " + WCSS_off_10); + System.out.print(" wcss10 = " + WCSS10); + System.out.print(" 
wcss11 = " + WCSS11); + System.out.print(" wcss12 = " + WCSS12); + +// float arr[] = {WCSS_off_1,WCSS_off_2,WCSS_off_3,WCSS_off_4,WCSS_off_5,WCSS_off_6,WCSS_off_7,WCSS_off_8,WCSS_off_9,WCSS_off_10}; + float arr[] = {WCSS1,WCSS2,WCSS3,WCSS4,WCSS5,WCSS6,WCSS7,WCSS8,WCSS9,WCSS10, WCSS11, WCSS12 }; - float arr[] = {WCSS_off_1,WCSS_off_2,WCSS_off_3,WCSS_off_4,WCSS_off_5,WCSS_off_6,WCSS_off_7,WCSS_off_8,WCSS_off_9,WCSS_off_10}; int index_of_max = smallest(arr); if (index_of_max == 0){ @@ -1000,6 +1004,21 @@ public Multimap findDensityModes2() { denseSetOfIDandCount2 = denseSetOfIDandCount2_10; System.out.println("winner = tree10"); } + if (index_of_max == 10) { + MapOfIDAndCount = MapOfIDAndCount11; + MapOfIDAndCent = MapOfIDAndCent11; + MapOfIDAndWCSS = MapOfIDAndWCSS11; + denseSetOfIDandCount2 = denseSetOfIDandCount2_11; + System.out.println("winner = tree11"); + } + if (index_of_max == 11) { + MapOfIDAndCount = MapOfIDAndCount12; + MapOfIDAndCent = MapOfIDAndCent12; + MapOfIDAndWCSS = MapOfIDAndWCSS12; + denseSetOfIDandCount2 = denseSetOfIDandCount2_12; + System.out.println("winner = tree12"); + } + System.out.println("NumberOfMicroClustersAfterPruning&beforesortingLimit = "+ denseSetOfIDandCount2.size()); @@ -1022,362 +1041,6 @@ public Multimap findDensityModes2() { } - // this is to be taken out . only done for hypothesis testing. 
- - boolean raw = Boolean.parseBoolean(("raw")); - List data = null; - try { - data = VectorUtil.readFile("/C:/Users/deysn/Desktop/temp/har/1D.txt", raw); - } catch (FileNotFoundException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } catch (IOException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - - Multimap multimapWeightAndCent1 = ArrayListMultimap.create(); - for (Long keys: sortedIDList2_1) - { - multimapWeightAndCent1.put((Long)(MapOfIDAndCount1.get(keys)), (float[]) (MapOfIDAndCent1.get(keys))); - } - - Multimap multimapWeightAndCent2 = ArrayListMultimap.create(); - for (Long keys: sortedIDList2_2) - { - multimapWeightAndCent2.put((Long)(MapOfIDAndCount2.get(keys)), (float[]) (MapOfIDAndCent2.get(keys))); - } - - Multimap multimapWeightAndCent3 = ArrayListMultimap.create(); - for (Long keys: sortedIDList2_3) - { - multimapWeightAndCent3.put((Long)(MapOfIDAndCount3.get(keys)), (float[]) (MapOfIDAndCent3.get(keys))); - } - - Multimap multimapWeightAndCent4 = ArrayListMultimap.create(); - for (Long keys: sortedIDList2_4) - { - multimapWeightAndCent4.put((Long)(MapOfIDAndCount4.get(keys)), (float[]) (MapOfIDAndCent4.get(keys))); - } - - Multimap multimapWeightAndCent5 = ArrayListMultimap.create(); - for (Long keys: sortedIDList2_5) - { - multimapWeightAndCent5.put((Long)(MapOfIDAndCount5.get(keys)), (float[]) (MapOfIDAndCent5.get(keys))); - } - - Multimap multimapWeightAndCent6 = ArrayListMultimap.create(); - for (Long keys: sortedIDList2_6) - { - multimapWeightAndCent6.put((Long)(MapOfIDAndCount6.get(keys)), (float[]) (MapOfIDAndCent6.get(keys))); - } - Multimap multimapWeightAndCent7 = ArrayListMultimap.create(); - for (Long keys: sortedIDList2_7) - { - multimapWeightAndCent7.put((Long)(MapOfIDAndCount7.get(keys)), (float[]) (MapOfIDAndCent7.get(keys))); - } - - Multimap multimapWeightAndCent8 = ArrayListMultimap.create(); - for (Long keys: sortedIDList2_8) - { - 
multimapWeightAndCent8.put((Long)(MapOfIDAndCount8.get(keys)), (float[]) (MapOfIDAndCent8.get(keys))); - } - - Multimap multimapWeightAndCent9 = ArrayListMultimap.create(); - for (Long keys: sortedIDList2_9) - { - multimapWeightAndCent9.put((Long)(MapOfIDAndCount9.get(keys)), (float[]) (MapOfIDAndCent9.get(keys))); - } - - Multimap multimapWeightAndCent10 = ArrayListMultimap.create(); - for (Long keys: sortedIDList2_10) - { - multimapWeightAndCent10.put((Long)(MapOfIDAndCount10.get(keys)), (float[]) (MapOfIDAndCent10.get(keys))); - } - - - - Listcentroids1 = new ArrayList<>(); - List weights1 =new ArrayList<>(); - for (Long weights : multimapWeightAndCent1.keys()) - { - weights1.add((float)weights); - } - - for (Long weight : multimapWeightAndCent1.keySet()) - - { - centroids1.addAll(multimapWeightAndCent1.get(weight)); - } - -// Agglomerative3 aggloOffline = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids1, so.getk()); -// aggloOffline.setWeights(weights1); -// List finalcentroids_1 = aggloOffline.getCentroids(); - - KMeans2 Offline = new KMeans2(); - Offline.setK(so.getk()); - Offline.setRawData(centroids1); - Offline.setWeights(weights1); - List finalcentroids_1 = Offline.getCentroids(); - - - - Listcentroids2 = new ArrayList<>(); - List weights2 =new ArrayList<>(); - for (Long weights : multimapWeightAndCent2.keys()) - { - weights2.add((float)weights); - } - - for (Long weight : multimapWeightAndCent2.keySet()) - - { - centroids2.addAll(multimapWeightAndCent1.get(weight)); - } - -// Agglomerative3 aggloOffline2 = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids2, so.getk()); -// aggloOffline2.setWeights(weights2); -// List finalcentroids_2 = aggloOffline2.getCentroids(); - - KMeans2 Offline2 = new KMeans2(); - Offline2.setK(so.getk()); - Offline2.setRawData(centroids2); - Offline2.setWeights(weights2); - List finalcentroids_2 = Offline2.getCentroids(); - - - Listcentroids3 = new ArrayList<>(); - List weights3 =new ArrayList<>(); - for (Long 
weights : multimapWeightAndCent3.keys()) - { - weights3.add((float)weights); - } - - for (Long weight : multimapWeightAndCent3.keySet()) - - { - centroids3.addAll(multimapWeightAndCent3.get(weight)); - } - -// Agglomerative3 aggloOffline3 = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids3, so.getk()); -// aggloOffline3.setWeights(weights3); -// List finalcentroids_3 = aggloOffline3.getCentroids(); - - KMeans2 Offline3 = new KMeans2(); - Offline3.setK(so.getk()); - Offline3.setRawData(centroids3); - Offline3.setWeights(weights3); - List finalcentroids_3 = Offline3.getCentroids(); - - Listcentroids4 = new ArrayList<>(); - List weights4 =new ArrayList<>(); - for (Long weights : multimapWeightAndCent4.keys()) - { - weights4.add((float)weights); - } - - for (Long weight : multimapWeightAndCent4.keySet()) - - { - centroids4.addAll(multimapWeightAndCent4.get(weight)); - } - -// Agglomerative3 aggloOffline4 = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids4, so.getk()); -// aggloOffline4.setWeights(weights4); -// List finalcentroids_4 = aggloOffline4.getCentroids(); - - KMeans2 Offline4 = new KMeans2(); - Offline4.setK(so.getk()); - Offline4.setRawData(centroids4); - Offline4.setWeights(weights4); - List finalcentroids_4 = Offline4.getCentroids(); - - - Listcentroids5 = new ArrayList<>(); - List weights5 =new ArrayList<>(); - for (Long weights : multimapWeightAndCent5.keys()) - { - weights5.add((float)weights); - } - - for (Long weight : multimapWeightAndCent5.keySet()) - - { - centroids5.addAll(multimapWeightAndCent5.get(weight)); - } - -// Agglomerative3 aggloOffline5 = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids5, so.getk()); -// aggloOffline5.setWeights(weights5); -// List finalcentroids_5 = aggloOffline5.getCentroids(); - - KMeans2 Offline5 = new KMeans2(); - Offline5.setK(so.getk()); - Offline5.setRawData(centroids5); - Offline5.setWeights(weights5); - List finalcentroids_5 = Offline5.getCentroids(); - - - Listcentroids6 = new 
ArrayList<>(); - List weights6 =new ArrayList<>(); - for (Long weights : multimapWeightAndCent6.keys()) - { - weights6.add((float)weights); - } - - for (Long weight : multimapWeightAndCent6.keySet()) - - { - centroids6.addAll(multimapWeightAndCent6.get(weight)); - } - -// Agglomerative3 aggloOffline6 = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids6, so.getk()); -// aggloOffline6.setWeights(weights6); -// List finalcentroids_6 = aggloOffline6.getCentroids(); - - KMeans2 Offline6 = new KMeans2(); - Offline6.setK(so.getk()); - Offline6.setRawData(centroids6); - Offline6.setWeights(weights6); - List finalcentroids_6 = Offline6.getCentroids(); - - Listcentroids7 = new ArrayList<>(); - List weights7 =new ArrayList<>(); - for (Long weights : multimapWeightAndCent7.keys()) - { - weights7.add((float)weights); - } - - for (Long weight : multimapWeightAndCent7.keySet()) - - { - centroids7.addAll(multimapWeightAndCent7.get(weight)); - } - -// Agglomerative3 aggloOffline7 = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids7, so.getk()); -// aggloOffline7.setWeights(weights7); -// List finalcentroids_7 = aggloOffline7.getCentroids(); - - KMeans2 Offline7 = new KMeans2(); - Offline7.setK(so.getk()); - Offline7.setRawData(centroids7); - Offline7.setWeights(weights7); - List finalcentroids_7 = Offline7.getCentroids(); - - - Listcentroids8 = new ArrayList<>(); - List weights8 =new ArrayList<>(); - for (Long weights : multimapWeightAndCent8.keys()) - { - weights8.add((float)weights); - } - - for (Long weight : multimapWeightAndCent8.keySet()) - - { - centroids8.addAll(multimapWeightAndCent8.get(weight)); - } - -// Agglomerative3 aggloOffline8 = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids8, so.getk()); -// aggloOffline8.setWeights(weights8); -// List finalcentroids_8 = aggloOffline8.getCentroids(); - - KMeans2 Offline8 = new KMeans2(); - Offline8.setK(so.getk()); - Offline8.setRawData(centroids8); - Offline8.setWeights(weights8); - List finalcentroids_8 = 
Offline8.getCentroids(); - - - Listcentroids9 = new ArrayList<>(); - List weights9 =new ArrayList<>(); - for (Long weights : multimapWeightAndCent9.keys()) - { - weights9.add((float)weights); - } - - for (Long weight : multimapWeightAndCent9.keySet()) - - { - centroids9.addAll(multimapWeightAndCent9.get(weight)); - } - -// Agglomerative3 aggloOffline9 = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids9, so.getk()); -// aggloOffline9.setWeights(weights9); -// List finalcentroids_9 = aggloOffline9.getCentroids(); - - KMeans2 Offline9 = new KMeans2(); - Offline9.setK(so.getk()); - Offline9.setRawData(centroids9); - Offline9.setWeights(weights9); - List finalcentroids_9 = Offline9.getCentroids(); - - - Listcentroids10 = new ArrayList<>(); - List weights10 =new ArrayList<>(); - for (Long weights : multimapWeightAndCent10.keys()) - { - weights10.add((float)weights); - } - - for (Long weight : multimapWeightAndCent10.keySet()) - - { - centroids10.addAll(multimapWeightAndCent10.get(weight)); - } - -// Agglomerative3 aggloOffline10 = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids10, so.getk()); -// aggloOffline10.setWeights(weights10); -// List finalcentroids_10 = aggloOffline10.getCentroids(); - - KMeans2 Offline10 = new KMeans2(); - Offline10.setK(so.getk()); - Offline10.setRawData(centroids10); - Offline10.setWeights(weights10); - List finalcentroids_10 = Offline10.getCentroids(); - - - VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/Desktop/temp/har/run_results/10runs/OutputTwrpCents_tree1"),finalcentroids_1, false); - - System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_1, data)); - - VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/Desktop/temp/har/run_results/10runs/OutputTwrpCents_tree2"),finalcentroids_2, false); - - System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_2, data)); - - VectorUtil.writeCentroidsToFile(new 
File("/C:/Users/deysn/Desktop/temp/har/run_results/10runs/OutputTwrpCents_tree3"),finalcentroids_3, false); - - System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_3, data)); - - VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/Desktop/temp/har/run_results/10runs/OutputTwrpCents_tree4"),finalcentroids_4, false); - - System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_4, data)); - - VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/Desktop/temp/har/run_results/10runs/OutputTwrpCents_tree5"),finalcentroids_5, false); - - System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_5, data)); - - VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/Desktop/temp/har/run_results/10runs/OutputTwrpCents_tree6"),finalcentroids_6, false); - - System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_6, data)); - - VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/Desktop/temp/har/run_results/10runs/OutputTwrpCents_tree7"),finalcentroids_7, false); - - System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_7, data)); - - VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/Desktop/temp/har/run_results/10runs/OutputTwrpCents_tree8"),finalcentroids_8, false); - - System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_8, data)); - - VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/Desktop/temp/har/run_results/10runs/OutputTwrpCents_tree9"),finalcentroids_9, false); - - System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_9, data)); - - VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/Desktop/temp/har/run_results/10runs/OutputTwrpCents_tree10"),finalcentroids_10, false); - - System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_10, data)); - - - return multimapWeightAndCent; } @@ -1394,6 +1057,9 @@ public void run() { rngvec8 = new float[so.getDimparameter()]; rngvec9 = new 
float[so.getDimparameter()]; rngvec10 = new float[so.getDimparameter()]; + rngvec11 = new float[so.getDimparameter()]; + rngvec12 = new float[so.getDimparameter()]; + counter = 0; boolean randVect = so.getRandomVector(); @@ -1409,6 +1075,8 @@ public void run() { Random r8 = new Random(); Random r9 = new Random(); Random r10 = new Random(); + Random r11 = new Random(); + Random r12 = new Random(); if (randVect==true){ for (int i = 0; i < so.getDimparameter(); i++) @@ -1431,6 +1099,11 @@ public void run() { rngvec9[i] = (float) r9.nextGaussian(); for (int i = 0; i < so.getDimparameter(); i++) rngvec10[i] = (float) r10.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec11[i] = (float) r11.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec11[i] = (float) r12.nextGaussian(); + } else { @@ -1464,37 +1137,67 @@ public void run() { centroids2.addAll(WeightAndClusters.get(weight)); } - + // Agglomerative3 aggloOffline = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids2, so.getk()); // aggloOffline.setWeights(weights2); // this.centroids = aggloOffline.getCentroids(); KMeans2 aggloOffline2 = new KMeans2(); - aggloOffline2.setK(so.getk()); aggloOffline2.setRawData(centroids2); aggloOffline2.setWeights(weights2); - this.centroids = aggloOffline2.getCentroids(); + + + List data1 = null; + data1 = so.getRawData(); + + List elbow_wcss = new ArrayList<>(); + + for (int k=min_k; k<=max_k ;k++) { + + aggloOffline2.setK(k); + List cent1 = aggloOffline2.getCentroids(); + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(cent1, data1)); + long tempu = (long) StatTests.WCSSECentroidsFloat(cent1, data1); + elbow_wcss.add(tempu); + } + + + // finding elbows + JythonTest elbowcalculator = new JythonTest(); + double sum_jt = 0; + + int num_of_clusters_stage2 = min_k + elbowcalculator.find_elbow(elbow_wcss); + + System.out.println("\n" + "No. 
of clusters_stage_2_Final = " + num_of_clusters_stage2 ); + + System.out.println("\n" + "No. of Data Points = " + so.getRawData().size() ); + + + // final choice of centroids : ( repetative calculation , please optimize ) + aggloOffline2.setK(num_of_clusters_stage2); + + this.centroids = aggloOffline2.getCentroids(); } public static void main(String[] args) throws FileNotFoundException, - IOException { + IOException , InterruptedException { - int k = 10;//6; - int d = 200;//16; - int n = 10000; - float var = 1.5f; - int count = 1; + // int k = 10;//6; + // int d = 200;//16; + // int n = 10000; + // float var = 1.5f; + // int count = 1; // System.out.printf("ClusterVar\t"); // for (int i = 0; i < count; i++) // System.out.printf("Trial%d\t", i); // System.out.printf("RealWCSS\n"); - String Output = "/C:/Users/deysn/Desktop/temp/har/run_results/10runs/OutputTwrpCents_mainfunc_1" ; + // String Output = "/C:/Users/deysn/Desktop/temp/har/run_results/10runs/OutputTwrpCents_mainfunc_1" ; - float f = var; + // float f = var; float avgrealwcss = 0; float avgtime = 0; // System.out.printf("%f\t", f); @@ -1507,9 +1210,15 @@ public static void main(String[] args) throws FileNotFoundException, boolean raw = Boolean.parseBoolean(("raw")); List data = null; - data = VectorUtil.readFile("/C:/Users/deysn/Desktop/temp/har/1D.txt", raw); - k = 6; - RPHashObject o = new SimpleArrayReader(data, 6); + // "C:\Users\sayan\OneDrive - University of Cincinnati\Documents\downloaded\run_results\run_results\3runs\har_k6\1D.txt" + + String inputfile = "C:/Users/sayan/OneDrive - University of Cincinnati/Documents/downloaded/run_results/run_results/3runs/har_k6/1D.txt" ; + System.out.println(inputfile); + + data = VectorUtil.readFile( inputfile , raw); + + int dummyk = 8; + RPHashObject o = new SimpleArrayReader(data, dummyk); o.setDimparameter(16); @@ -1527,8 +1236,8 @@ public static void main(String[] args) throws FileNotFoundException, // avgrealwcss += 
StatTests.WCSSEFloatCentroid(gen.getMedoids(),gen.getData()); - - VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); + String Output = "/C:/Users/sayan/OneDrive - University of Cincinnati/Documents/downloaded/results/har_6clus/har_k6_kmeans_130_cutoff"+"_" +"_"+".csv" ; + VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); // System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, data)); diff --git a/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_10runs_agingcents.java b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_10runs_agingcents.java new file mode 100644 index 0000000..018da2c --- /dev/null +++ b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_10runs_agingcents.java @@ -0,0 +1,1419 @@ +package edu.uc.rphash; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +//import java.util.Arrays; +import java.util.HashMap; +//import java.util.Iterator; +//import java.util.LinkedHashMap; +import java.util.List; +//import java.util.Map; +import java.util.Map.Entry; +import java.util.Random; +import java.util.TreeSet; +import java.util.stream.Stream; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.kneefinder.JythonTest; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.clusterers.Agglomerative3; +import edu.uc.rphash.tests.clusterers.KMeans2; +import edu.uc.rphash.tests.clusterers.Agglomerative3.ClusteringType; +import edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.VectorUtil; +import edu.uc.rphash.aging.ageCentriods; + +//import org.apache.commons.collections.map.MultiValueMap; +//import org.apache.commons.collections.map.*; +import com.google.common.collect.ArrayListMultimap; +import 
com.google.common.collect.Multimap; + + + +// this algorithm runs twrp 10 times : (only the random bisection vector varies, the Projection matrix remains same) +// and selects the one which has the best wcss offline for the 10X candidate centroids. +public class TWRPv6_wcss_offline2_TEST2_10runs_agingcents implements Clusterer, Runnable { + + boolean znorm = false; + int [] num_of_clusters_stage1 = new int[12]; + int min_k; + int max_k; + + // convert this to an array of arrays + private int counter; + private float[] rngvec; + private float[] rngvec2; + private float[] rngvec3; + private float[] rngvec4; + private float[] rngvec5; + private float[] rngvec6; + private float[] rngvec7; + private float[] rngvec8; + private float[] rngvec9; + private float[] rngvec10; + private float[] rngvec11; + private float[] rngvec12; + + private List centroids = null; + + private RPHashObject so; + + public TWRPv6_wcss_offline2_TEST2_10runs_agingcents(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + return getCentroids(); + } + + @Override + public List getCentroids() { + if (centroids == null) + run(); + return centroids; + } + + +// This function returns the square of the euclidean distance. + public static float distancesq(float[] x, float[] y) { + if (x.length < 1) + return Float.MAX_VALUE; + if (y.length < 1) + return Float.MAX_VALUE; + float dist = (x[0] - y[0]) * (x[0] - y[0]); + for (int i = 1; i < x.length; i++) + dist += ((x[i] - y[i]) * (x[i] - y[i])); +// return (float) Math.sqrt(dist); + return dist; + } + + +// This method finds the smallest of the numbers and returns that index. 
+ + public static int smallest(float[] arr) + { + // Initialize minimum element + float min = arr[0]; + int minindex = 0; + // System.out.println(" LENGHT : " + arr.length); + // Traverse array elements from second and + // compare every element with current max + for (int i = 1; i < (arr.length); i++) { + // System.out.println("the min value of i : " + i); + if (arr[i]< min) { + min = arr[i]; + minindex = i; + } + } + // System.out.println("the min value is : " + min); + // System.out.println("the index for min val is : " + minindex); + return minindex; + } + + + /* + * X - set of vectors compute the medoid of a vector set + */ + float[] medoid(List X) { + float[] ret = X.get(0); + for (int i = 1; i < X.size(); i++) { + for (int j = 0; j < ret.length; j++) { + ret[j] += X.get(i)[j]; + } + } + for (int j = 0; j < ret.length; j++) { + ret[j] = ret[j] / ((float) X.size()); + } + return ret; + } + +// this updates the map two cents with different weights are merged into one. + public static float[][] UpdateHashMap(float cnt_1, float[] x_1, float wcss_1, + float cnt_2, float[] x_2 , float wcss_2) { + + float cnt_r = cnt_1 + cnt_2; + + float[] x_r = new float[x_1.length]; + + for (int i = 0; i < x_1.length; i++) { + x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; + + } + + float wcss = ( ((wcss_1 + distancesq(x_r,x_1)) ) + distancesq(x_r,x_2) ); + + float[][] ret = new float[3][]; + ret[0] = new float[1]; + ret[0][0] = cnt_r; + ret[1] = x_r; + ret[2]= new float [1]; + ret[2][0]= wcss; + return ret; + + } + + + + public long hashvec2( float[] xt, float[] x, + HashMap MapOfIDAndCent, HashMap MapOfIDAndCount, int ct, float[] rngvec, HashMap MapOfIDAndWCSS) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { +// s <<= 1; + s = s << 1 ; // left shift the bits of s by 1. 
+ if (xt[i] > rngvec[i]) + s= s+1; + + if (MapOfIDAndCent.containsKey(s)) { + + float CurrentCount = MapOfIDAndCount.get(s); + float CurrentCent [] = MapOfIDAndCent.get(s); + float CountForIncomingVector = 1; + float IncomingVector [] = x; + float currentWcss= MapOfIDAndWCSS.get(s); + float incomingWcss= 0; + + float[][] MergedValues = UpdateHashMap(CurrentCount , CurrentCent, currentWcss, CountForIncomingVector, IncomingVector, incomingWcss ); + + Long UpdatedCount = (long) MergedValues[0][0] ; + + float[] MergedVector = MergedValues[1] ; + + float wcss= MergedValues[2][0]; + + MapOfIDAndCount.put(s , UpdatedCount); + + MapOfIDAndCent.put(s, MergedVector); + + MapOfIDAndWCSS.put(s, wcss); + + } + + else { + + float[] xlist = x; + MapOfIDAndCent.put(s, xlist); + MapOfIDAndCount.put(s, (long)1); + MapOfIDAndWCSS.put(s, (float)0); + } + } + return s; + } + + + /* + * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid + * vector map + * + * hash the projected vector x and update the hash to centroid and counts + * maps + */ + void addtocounter(float[] x, Projector p, + HashMap IDAndCent,HashMap IDandID,int ct, float[] rngvec , HashMap IDandWCSS) { + float[] xt = p.project(x); + + hashvec2(xt,x,IDAndCent, IDandID, ct,rngvec , IDandWCSS); + } + + + /* + * X - data set k - canonical k in k-means l - clustering sub-space Compute + * density mode via iterative deepening hash counting + */ + + public Multimap findDensityModes2() { + + HashMap MapOfIDAndCent1 = new HashMap<>(); + HashMap MapOfIDAndCount1 = new HashMap<>(); + HashMap MapOfIDAndWCSS1 = new HashMap<>(); + + HashMap MapOfIDAndCent2 = new HashMap<>(); + HashMap MapOfIDAndCount2 = new HashMap<>(); + HashMap MapOfIDAndWCSS2 = new HashMap<>(); + + HashMap MapOfIDAndCent3 = new HashMap<>(); + HashMap MapOfIDAndCount3 = new HashMap<>(); + HashMap MapOfIDAndWCSS3 = new HashMap<>(); + + HashMap MapOfIDAndCent4 = new HashMap<>(); + HashMap MapOfIDAndCount4 = new HashMap<>(); + HashMap MapOfIDAndWCSS4 = 
new HashMap<>(); + + HashMap MapOfIDAndCent5 = new HashMap<>(); + HashMap MapOfIDAndCount5 = new HashMap<>(); + HashMap MapOfIDAndWCSS5 = new HashMap<>(); + + HashMap MapOfIDAndCent6 = new HashMap<>(); + HashMap MapOfIDAndCount6 = new HashMap<>(); + HashMap MapOfIDAndWCSS6 = new HashMap<>(); + + HashMap MapOfIDAndCent7 = new HashMap<>(); + HashMap MapOfIDAndCount7 = new HashMap<>(); + HashMap MapOfIDAndWCSS7 = new HashMap<>(); + + HashMap MapOfIDAndCent8 = new HashMap<>(); + HashMap MapOfIDAndCount8 = new HashMap<>(); + HashMap MapOfIDAndWCSS8 = new HashMap<>(); + + HashMap MapOfIDAndCent9 = new HashMap<>(); + HashMap MapOfIDAndCount9 = new HashMap<>(); + HashMap MapOfIDAndWCSS9 = new HashMap<>(); + + HashMap MapOfIDAndCent10 = new HashMap<>(); + HashMap MapOfIDAndCount10 = new HashMap<>(); + HashMap MapOfIDAndWCSS10 = new HashMap<>(); + + HashMap MapOfIDAndCent11 = new HashMap<>(); + HashMap MapOfIDAndCount11 = new HashMap<>(); + HashMap MapOfIDAndWCSS11 = new HashMap<>(); + + HashMap MapOfIDAndCent12 = new HashMap<>(); + HashMap MapOfIDAndCount12 = new HashMap<>(); + HashMap MapOfIDAndWCSS12 = new HashMap<>(); + + + // #create projector matrixs + Projector projector = so.getProjectionType(); + projector.setOrigDim(so.getdim()); + projector.setProjectedDim(so.getDimparameter()); + projector.setRandomSeed(so.getRandomSeed()); +// projector.setRandomSeed(535247432); + projector.init(); + + // #create projector matrixs + Projector projector2 = so.getProjectionType(); + projector2.setOrigDim(so.getdim()); + projector2.setProjectedDim(so.getDimparameter()); + projector2.setRandomSeed(so.getRandomSeed()); +// projector.setRandomSeed(535247432); + projector2.init(); + + // #create projector matrixs + Projector projector3 = so.getProjectionType(); + projector3.setOrigDim(so.getdim()); + projector3.setProjectedDim(so.getDimparameter()); + projector3.setRandomSeed(so.getRandomSeed()); +// projector.setRandomSeed(535247432); + projector3.init(); + + int cutoff = 
so.getCutoff(); + + int ct = 0; + + { + + for (float[] x : so.getRawData()) + { + addtocounter(x, projector, MapOfIDAndCent1, MapOfIDAndCount1,ct++, rngvec, MapOfIDAndWCSS1); + addtocounter(x, projector, MapOfIDAndCent2, MapOfIDAndCount2,ct++, rngvec2,MapOfIDAndWCSS2); + addtocounter(x, projector, MapOfIDAndCent3, MapOfIDAndCount3,ct++, rngvec3,MapOfIDAndWCSS3); + addtocounter(x, projector, MapOfIDAndCent4, MapOfIDAndCount4,ct++, rngvec4,MapOfIDAndWCSS4); + + addtocounter(x, projector2, MapOfIDAndCent5, MapOfIDAndCount5,ct++, rngvec5,MapOfIDAndWCSS5); + addtocounter(x, projector2, MapOfIDAndCent6, MapOfIDAndCount6,ct++, rngvec6, MapOfIDAndWCSS6); + addtocounter(x, projector2, MapOfIDAndCent7, MapOfIDAndCount7,ct++, rngvec7,MapOfIDAndWCSS7); + addtocounter(x, projector2, MapOfIDAndCent8, MapOfIDAndCount8,ct++, rngvec8,MapOfIDAndWCSS8); + + addtocounter(x, projector3, MapOfIDAndCent9, MapOfIDAndCount9,ct++, rngvec9,MapOfIDAndWCSS9); + addtocounter(x, projector3, MapOfIDAndCent10, MapOfIDAndCount10,ct++, rngvec10,MapOfIDAndWCSS10); + addtocounter(x, projector3, MapOfIDAndCent11, MapOfIDAndCount11,ct++, rngvec11,MapOfIDAndWCSS11); + addtocounter(x, projector3, MapOfIDAndCent12, MapOfIDAndCount12,ct++, rngvec12,MapOfIDAndWCSS12); + + + } + } + + System.out.println("NumberOfMicroClustersBeforePruning = "+ MapOfIDAndCent3.size()); + + // next we want to prune the tree by parent count comparison + // follows breadthfirst search + + HashMap denseSetOfIDandCount2_1 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount1.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount1.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount1.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_1.put(parent_id, 0L); + + MapOfIDAndCent1.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + + 
} + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_1.remove(parent_id); + + MapOfIDAndCent1.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_2 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount2.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount2.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount2.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_2.put(parent_id, 0L); + + MapOfIDAndCent2.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_2.remove(parent_id); + + MapOfIDAndCent2.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_3 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount3.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount3.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount3.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_3.put(parent_id, 0L); + + MapOfIDAndCent3.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_3.remove(parent_id); + + MapOfIDAndCent3.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_4 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount4.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) 
(MapOfIDAndCount4.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount4.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_4.put(parent_id, 0L); + + MapOfIDAndCent4.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_4.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_4.remove(parent_id); + + MapOfIDAndCent4.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_4.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_5 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount5.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount5.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount5.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_5.put(parent_id, 0L); + + MapOfIDAndCent5.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_5.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_5.remove(parent_id); + + MapOfIDAndCent5.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_5.put(cur_id, (long) cur_count); + } + } + } + } + } + + + + HashMap denseSetOfIDandCount2_6 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount6.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount6.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount6.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_6.put(parent_id, 0L); + + MapOfIDAndCent6.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_6.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > 
parent_count) { + denseSetOfIDandCount2_6.remove(parent_id); + + MapOfIDAndCent6.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_6.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_7 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount7.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount7.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount7.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_7.put(parent_id, 0L); + + MapOfIDAndCent7.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_7.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_7.remove(parent_id); + + MapOfIDAndCent7.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_7.put(cur_id, (long) cur_count); + } + } + } + } + } + + HashMap denseSetOfIDandCount2_8 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount8.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount8.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount8.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_8.put(parent_id, 0L); + + MapOfIDAndCent8.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_8.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_8.remove(parent_id); + + MapOfIDAndCent8.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_8.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_9 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount9.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount9.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + 
int parent_count = (int) (MapOfIDAndCount9.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_9.put(parent_id, 0L); + + MapOfIDAndCent9.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_9.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_9.remove(parent_id); + + MapOfIDAndCent9.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_9.put(cur_id, (long) cur_count); + } + } + } + } + } + + HashMap denseSetOfIDandCount2_10 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount10.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount10.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount10.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_10.put(parent_id, 0L); + + MapOfIDAndCent10.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_10.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_10.remove(parent_id); + + MapOfIDAndCent10.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_10.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_11 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount11.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount11.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount11.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_11.put(parent_id, 0L); + + MapOfIDAndCent11.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_11.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_11.remove(parent_id); + + 
MapOfIDAndCent11.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_11.put(cur_id, (long) cur_count); + } + } + } + } + } + + HashMap denseSetOfIDandCount2_12 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount12.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount12.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount12.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_12.put(parent_id, 0L); + + MapOfIDAndCent12.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_12.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_12.remove(parent_id); + + MapOfIDAndCent12.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_12.put(cur_id, (long) cur_count); + } + } + } + } + } + + //remove keys with support less than 1 + Stream> stream2_1 = denseSetOfIDandCount2_1.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_1= new ArrayList<>(); + // sort and limit the list + stream2_1.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_1.add(x.getKey())); + + + Stream> stream2_2 = denseSetOfIDandCount2_2.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_2= new ArrayList<>(); + // sort and limit the list + stream2_2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_2.add(x.getKey())); + + + Stream> stream2_3 = denseSetOfIDandCount2_3.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_3= new ArrayList<>(); + // sort and limit the list + stream2_3.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_3.add(x.getKey())); + + + Stream> stream2_4 = denseSetOfIDandCount2_4.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_4= new ArrayList<>(); + // sort and limit the list + stream2_4.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_4.add(x.getKey())); + + Stream> stream2_5 = denseSetOfIDandCount2_5.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_5= new ArrayList<>(); + // sort and limit the list + stream2_5.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_5.add(x.getKey())); + + + Stream> stream2_6 = denseSetOfIDandCount2_6.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_6= new ArrayList<>(); + // sort and limit the list + stream2_6.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_6.add(x.getKey())); + + + Stream> stream2_7 = denseSetOfIDandCount2_7.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_7= new ArrayList<>(); + // sort and limit the list + stream2_7.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_7.add(x.getKey())); + + + Stream> stream2_8 = denseSetOfIDandCount2_8.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_8= new ArrayList<>(); + // sort and limit the list + stream2_8.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_8.add(x.getKey())); + + + Stream> stream2_9 = denseSetOfIDandCount2_9.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_9= new ArrayList<>(); + // sort and limit the list + stream2_9.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_9.add(x.getKey())); + + + Stream> stream2_10 = denseSetOfIDandCount2_10.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_10= new ArrayList<>(); + // sort and limit the list + stream2_10.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_10.add(x.getKey())); + + + Stream> stream2_11 = denseSetOfIDandCount2_11.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_11= new ArrayList<>(); + // sort and limit the list + stream2_11.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_11.add(x.getKey())); + + Stream> stream2_12 = denseSetOfIDandCount2_12.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_12= new ArrayList<>(); + // sort and limit the list + stream2_12.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_12.add(x.getKey())); + +// finding elbows + JythonTest elbowcalculator = new JythonTest(); + double sum_jt = 0; + + num_of_clusters_stage1[0]= elbowcalculator.find_elbow(sortedIDList2_1); + num_of_clusters_stage1[1]= elbowcalculator.find_elbow(sortedIDList2_2); + num_of_clusters_stage1[2]= elbowcalculator.find_elbow(sortedIDList2_3); + num_of_clusters_stage1[3]= elbowcalculator.find_elbow(sortedIDList2_4); + num_of_clusters_stage1[4]= elbowcalculator.find_elbow(sortedIDList2_5); + num_of_clusters_stage1[5]= elbowcalculator.find_elbow(sortedIDList2_6); + num_of_clusters_stage1[6]= elbowcalculator.find_elbow(sortedIDList2_7); + num_of_clusters_stage1[7]= elbowcalculator.find_elbow(sortedIDList2_8); + num_of_clusters_stage1[8]= elbowcalculator.find_elbow(sortedIDList2_9); + num_of_clusters_stage1[9]= elbowcalculator.find_elbow(sortedIDList2_10); + num_of_clusters_stage1[10]= elbowcalculator.find_elbow(sortedIDList2_11); + num_of_clusters_stage1[11]= 
elbowcalculator.find_elbow(sortedIDList2_12); + + for (int i=0 ; i<12; i++) { + + //int num_of_clusters_2= elbowcalculator.find_elbow(counts); +//// System.out.println("\n" + "No. of clusters_stage1 = " + num_of_clusters_stage1[i]); + //System.out.println( "No. of clusters_by_COUNT = " + num_of_clusters_2); +//// System.out.println( "************************************************************" ); + sum_jt = sum_jt + num_of_clusters_stage1[i]; + } + System.out.println("\n" + "sum of No. of clusters_stage1 = " + sum_jt); + + double avg_clus_stage1 = Math.ceil(sum_jt / 12.0) ; + System.out.println("\n" + "Average of No. of clusters_stage1 = " + avg_clus_stage1 ); + System.out.println( "************************************************************" ); + // finding the range of K for offline clustering : + min_k = (int) (avg_clus_stage1 - 3) ; + + max_k = (int) (avg_clus_stage1 + 3) ; + + if (min_k < 3 ) { min_k = 3 ; max_k = 9; } ; + + + float WCSS1 = 0; + float WCSS2 = 0; + float WCSS3 = 0; + float WCSS4 = 0; + float WCSS5 = 0; + float WCSS6 = 0; + float WCSS7 = 0; + float WCSS8 = 0; + float WCSS9 = 0; + float WCSS10 = 0; + float WCSS11 = 0; + float WCSS12 = 0; + + + HashMap denseSetOfIDandCount2 = new HashMap(); + HashMap MapOfIDAndCent = new HashMap<>(); + HashMap MapOfIDAndCount = new HashMap<>(); + HashMap MapOfIDAndWCSS = new HashMap<>(); + + + //* THIS IS THE RUNTIME CALCULATION OF WCSS STATISTICS WHICH IS DONE ONLINE: + + for (Long keys: sortedIDList2_1){ + WCSS1 = WCSS1 + MapOfIDAndWCSS1.get(keys);} + + for (Long keys: sortedIDList2_2) + { WCSS2 = WCSS2 + MapOfIDAndWCSS2.get(keys);} + + for (Long keys: sortedIDList2_3) + { WCSS3 = WCSS3 + MapOfIDAndWCSS3.get(keys);} + + for (Long keys: sortedIDList2_4) + { WCSS4 = WCSS4 + MapOfIDAndWCSS4.get(keys);} + + for (Long keys: sortedIDList2_5) + { WCSS5 = WCSS5 + MapOfIDAndWCSS5.get(keys);} + + for (Long keys: sortedIDList2_6){ + WCSS6 = WCSS6 + MapOfIDAndWCSS6.get(keys);} + + for (Long keys: sortedIDList2_7) + { 
WCSS7 = WCSS7 + MapOfIDAndWCSS7.get(keys);} + + for (Long keys: sortedIDList2_8) + { WCSS8 = WCSS8 + MapOfIDAndWCSS8.get(keys);} + + for (Long keys: sortedIDList2_9) + { WCSS9 = WCSS9 + MapOfIDAndWCSS9.get(keys);} + + for (Long keys: sortedIDList2_10) + { WCSS10 = WCSS10 + MapOfIDAndWCSS10.get(keys);} + + for (Long keys: sortedIDList2_11) + { WCSS11 = WCSS11 + MapOfIDAndWCSS11.get(keys);} + + for (Long keys: sortedIDList2_12) + { WCSS12 = WCSS12 + MapOfIDAndWCSS12.get(keys);} + + + + System.out.print(" wcss1 = " + WCSS1); + + System.out.print(" wcss2 = " + WCSS2); + + System.out.print(" wcss3 = " + WCSS3); + + System.out.print(" wcss4 = " + WCSS4); + + System.out.print(" wcss5 = " + WCSS5); + + System.out.print(" wcss6 = " + WCSS6); + + System.out.print(" wcss7 = " + WCSS7); + + System.out.print(" wcss8 = " + WCSS8); + + System.out.print(" wcss9 = " + WCSS9); + + System.out.print(" wcss10 = " + WCSS10); + + System.out.print(" wcss11 = " + WCSS11); + + System.out.print(" wcss12 = " + WCSS12); + +// float arr[] = {WCSS_off_1,WCSS_off_2,WCSS_off_3,WCSS_off_4,WCSS_off_5,WCSS_off_6,WCSS_off_7,WCSS_off_8,WCSS_off_9,WCSS_off_10}; + float arr[] = {WCSS1,WCSS2,WCSS3,WCSS4,WCSS5,WCSS6,WCSS7,WCSS8,WCSS9,WCSS10, WCSS11, WCSS12 }; + + int index_of_max = smallest(arr); + + if (index_of_max == 0){ + MapOfIDAndCount = MapOfIDAndCount1; + MapOfIDAndCent = MapOfIDAndCent1; + MapOfIDAndWCSS = MapOfIDAndWCSS1; + denseSetOfIDandCount2 = denseSetOfIDandCount2_1; + System.out.println("winner = tree1"); + } + if (index_of_max == 1){ + MapOfIDAndCount = MapOfIDAndCount2; + MapOfIDAndCent = MapOfIDAndCent2; + MapOfIDAndWCSS = MapOfIDAndWCSS2; + denseSetOfIDandCount2 = denseSetOfIDandCount2_2; + System.out.println("winner = tree2"); + } + if (index_of_max == 2){ + MapOfIDAndCount = MapOfIDAndCount3; + MapOfIDAndCent = MapOfIDAndCent3; + MapOfIDAndWCSS = MapOfIDAndWCSS3; + denseSetOfIDandCount2 = denseSetOfIDandCount2_3; + System.out.println("winner = tree3"); + } + if (index_of_max == 3) { + 
MapOfIDAndCount = MapOfIDAndCount4; + MapOfIDAndCent = MapOfIDAndCent4; + MapOfIDAndWCSS = MapOfIDAndWCSS4; + denseSetOfIDandCount2 = denseSetOfIDandCount2_4; + System.out.println("winner = tree4"); + } + + if (index_of_max == 4) { + MapOfIDAndCount = MapOfIDAndCount5; + MapOfIDAndCent = MapOfIDAndCent5; + MapOfIDAndWCSS = MapOfIDAndWCSS5; + denseSetOfIDandCount2 = denseSetOfIDandCount2_5; + System.out.println("winner = tree5"); + } + if (index_of_max == 5) { + MapOfIDAndCount = MapOfIDAndCount6; + MapOfIDAndCent = MapOfIDAndCent6; + MapOfIDAndWCSS = MapOfIDAndWCSS6; + denseSetOfIDandCount2 = denseSetOfIDandCount2_6; + System.out.println("winner = tree6"); + } + if (index_of_max == 6) { + MapOfIDAndCount = MapOfIDAndCount7; + MapOfIDAndCent = MapOfIDAndCent7; + MapOfIDAndWCSS = MapOfIDAndWCSS7; + denseSetOfIDandCount2 = denseSetOfIDandCount2_7; + System.out.println("winner = tree7"); + } + if (index_of_max == 7) { + MapOfIDAndCount = MapOfIDAndCount8; + MapOfIDAndCent = MapOfIDAndCent8; + MapOfIDAndWCSS = MapOfIDAndWCSS8; + denseSetOfIDandCount2 = denseSetOfIDandCount2_8; + System.out.println("winner = tree8"); + } + if (index_of_max == 8) { + MapOfIDAndCount = MapOfIDAndCount9; + MapOfIDAndCent = MapOfIDAndCent9; + MapOfIDAndWCSS = MapOfIDAndWCSS9; + denseSetOfIDandCount2 = denseSetOfIDandCount2_9; + System.out.println("winner = tree9"); + } + if (index_of_max == 9) { + MapOfIDAndCount = MapOfIDAndCount10; + MapOfIDAndCent = MapOfIDAndCent10; + MapOfIDAndWCSS = MapOfIDAndWCSS10; + denseSetOfIDandCount2 = denseSetOfIDandCount2_10; + System.out.println("winner = tree10"); + } + if (index_of_max == 10) { + MapOfIDAndCount = MapOfIDAndCount11; + MapOfIDAndCent = MapOfIDAndCent11; + MapOfIDAndWCSS = MapOfIDAndWCSS11; + denseSetOfIDandCount2 = denseSetOfIDandCount2_11; + System.out.println("winner = tree11"); + } + if (index_of_max == 11) { + MapOfIDAndCount = MapOfIDAndCount12; + MapOfIDAndCent = MapOfIDAndCent12; + MapOfIDAndWCSS = MapOfIDAndWCSS12; + 
denseSetOfIDandCount2 = denseSetOfIDandCount2_12; + System.out.println("winner = tree12"); + } + + + System.out.println("NumberOfMicroClustersAfterPruning&beforesortingLimit = "+ denseSetOfIDandCount2.size()); + + //remove keys with support less than 1 + Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2= new ArrayList<>(); + // sort and limit the list + stream2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2.add(x.getKey())); + + Multimap multimapWeightAndCent = ArrayListMultimap.create(); + + for (Long keys: sortedIDList2) + + { + + multimapWeightAndCent.put((Long)(MapOfIDAndCount.get(keys)), (float[]) (MapOfIDAndCent.get(keys))); + + } + + return multimapWeightAndCent; + +} + + public void run() { + rngvec = new float[so.getDimparameter()]; + rngvec2 = new float[so.getDimparameter()]; + rngvec3 = new float[so.getDimparameter()]; + rngvec4 = new float[so.getDimparameter()]; + rngvec5 = new float[so.getDimparameter()]; + + rngvec6 = new float[so.getDimparameter()]; + rngvec7 = new float[so.getDimparameter()]; + rngvec8 = new float[so.getDimparameter()]; + rngvec9 = new float[so.getDimparameter()]; + rngvec10 = new float[so.getDimparameter()]; + rngvec11 = new float[so.getDimparameter()]; + rngvec12 = new float[so.getDimparameter()]; + + counter = 0; + boolean randVect = so.getRandomVector(); + + // Random r = new Random(so.getRandomSeed()); + // Random r = new Random(3800635955020675334L) ; + Random r = new Random(); + Random r2 = new Random(); + Random r3 = new Random(); + Random r4 = new Random(); + Random r5 = new Random(); + Random r6 = new Random(); + Random r7 = new Random(); + Random r8 = new Random(); + Random r9 = new Random(); + Random r10 = new Random(); + Random r11 = new Random(); + Random r12 = new Random(); + + if (randVect==true){ + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) r.nextGaussian(); + for (int i = 0; i < 
so.getDimparameter(); i++) + rngvec2[i] = (float) r2.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec3[i] = (float) r3.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec4[i] = (float) r4.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec5[i] = (float) r5.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec6[i] = (float) r6.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec7[i] = (float) r7.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec8[i] = (float) r8.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec9[i] = (float) r9.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec10[i] = (float) r10.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec11[i] = (float) r11.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec11[i] = (float) r12.nextGaussian(); + + } + + else { + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) 0; + + } + + Multimap WeightAndClusters = findDensityModes2(); + + Listcentroids2 = new ArrayList<>(); + List weights2 =new ArrayList<>(); + + + System.out.println("\tNumberOfMicroClusters_AfterPruning = "+ WeightAndClusters.size()); + // System.out.println("getRandomVector = "+ randVect); + + + for (Long weights : WeightAndClusters.keys()) + { + + weights2.add((float)weights); + + } + + + for (Long weight : WeightAndClusters.keySet()) + + { + + centroids2.addAll(WeightAndClusters.get(weight)); + + } + + +// Agglomerative3 aggloOffline = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids2, so.getk()); +// aggloOffline.setWeights(weights2); +// this.centroids = aggloOffline.getCentroids(); + + KMeans2 aggloOffline2 = new KMeans2(); + aggloOffline2.setRawData(centroids2); + aggloOffline2.setWeights(weights2); + + + List data1 = null; + data1 = so.getRawData(); + //System.out.println("\n" + "No. 
of Data Points = " + so.getRawData().size() ); + System.out.println("\n" + "No. of Data Points = " + data1.size() ); + + List elbow_wcss = new ArrayList<>(); + + for (int k=min_k; k<=max_k ;k++) { + + aggloOffline2.setK(k); + List cent1 = aggloOffline2.getCentroids(); + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(cent1, data1)); + long tempu = (long) StatTests.WCSSECentroidsFloat(cent1, data1); + elbow_wcss.add(tempu); + } + + + // finding elbows + JythonTest elbowcalculator = new JythonTest(); + double sum_jt = 0; + + int num_of_clusters_stage2 = min_k + elbowcalculator.find_elbow(elbow_wcss); + + System.out.println("\n" + "No. of clusters_stage_2_Final = " + num_of_clusters_stage2 ); + + + // final choice of centroids : ( repetative calculation , please optimize ) + aggloOffline2.setK(num_of_clusters_stage2); + + this.centroids = aggloOffline2.getCentroids(); + + } + + + public static void main(String[] args) throws FileNotFoundException, + IOException , InterruptedException { + + // int k = 10;//6; + // int d = 200;//16; + // int n = 10000; + // float var = 1.5f; + // int count = 1; + // System.out.printf("ClusterVar\t"); + // for (int i = 0; i < count; i++) + // System.out.printf("Trial%d\t", i); + // System.out.printf("RealWCSS\n"); + + // String Output = "/C:/Users/deysn/Desktop/temp/har/run_results/10runs/OutputTwrpCents_mainfunc_1" ; + + // float f = var; + float avgrealwcss = 0; + float avgtime = 0; + // System.out.printf("%f\t", f); + // GenerateData gen = new GenerateData(k, n/k, d, f, true, .5f); + + // gen.writeCSVToFile(new File("/home/lee/Desktop/reclsh/in.csv")); + // List data = "/C:/Users/user/Desktop/temp/OutputTwrpCents1" + + // RPHashObject o = new SimpleArrayReader(gen.data, k); + + boolean raw = Boolean.parseBoolean(("raw")); + List data = null; + List data_in_round = new ArrayList() ; + // "C:\Users\sayan\OneDrive - University of Cincinnati\Documents\downloaded\run_results\run_results\3runs\har_k6\1D.txt" + + String inputfile = 
"C:/Users/sayan/OneDrive - University of Cincinnati/Documents/downloaded/run_results/run_results/3runs/har_k6/1D.txt" ; + System.out.println(inputfile); + + data = VectorUtil.readFile( inputfile , raw); + int count1=0; + int count2=0; + // List cents_aged = null ; //new ArrayList(); /// may required to be initialized + // List cents_prev_round = null ; /// may required to be properly initialized + + boolean flag = true; // indicates first round if true else is false + + + List cents_aged = null; //null ; //new ArrayList(); /// may required to be initialized + List cents_prev_round = new ArrayList() ; //null ; /// may required to be properly initialized + + int round = 0; + for (float[] element : data) + + + { + count1 = count1+1; + //System.out.println(count1); + //System.out.println(element); + data_in_round.add(data.get(count1-1)); + count2 = count2 +1; + + //System.out.println(count2); + + if (count2 >= 1000) { + //if (count2 == 10299) { + System.out.println(count2); + + round = round + 1; + System.out.println("round is : " + round + "\n" ); + int dummyk = 8; + RPHashObject o = new SimpleArrayReader(data_in_round, dummyk); + + + o.setDimparameter(16); + o.setCutoff(70); + o.setRandomVector(true); + +// System.out.println("cutoff = "+ o.getCutoff()); +// System.out.println("get_random_Vector = "+ o.getRandomVector()); + + TWRPv6_wcss_offline2_TEST2_10runs_agingcents rphit = new TWRPv6_wcss_offline2_TEST2_10runs_agingcents(o); + long startTime = System.nanoTime(); + + + List centsr = null ; //null; // new ArrayList(); + centsr = rphit.getCentroids(); // check if overwritten ? 
otherwise clear + + avgtime += (System.nanoTime() - startTime) / 100000000; + +// avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(),gen.getData()); + +// System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, data_in_round)); + System.out.println("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); +// System.gc(); + +// System.out.printf("%.0f\n", avgrealwcss / count); + // if prev round cents are null i.e. 1st round aged cents = cents of this round i.e no aging + // if prev round cents are there then merge aged cents with this round cents + //System.out.println(centsr); + //System.out.println(centsr.size() ); + + if (flag == true) { + + //System.out.println(cents_prev_round); + //System.out.println(centsr); + cents_prev_round = centsr ; + cents_aged = centsr; // have to modify + // Centroid.removeallobjects(cents_aged); // have to modify + flag=false ; + + } + + // cents_prev_round.clear(); + cents_prev_round = cents_aged; + + // System.out.println(centsr); + // System.out.println(cents_aged); + // System.out.println(cents_prev_round); + + int pos=0; + List test1 = new ArrayList(); + for (Centroid vector : centsr) { + // System.out.println(vector.dimensions + " dimensions " + "\n"); + pos=pos+1; + int index1 = VectorUtil.findNearestDistance(vector, cents_prev_round); + // System.out.println( " nearest one : " + index1 + "\n"); + + // call weighted_merge(float cnt_1, float[] x_1, float cnt_2, float[] x_2) + float[] current_cent = vector.centroid(); + float[] prev_mapped_cent= cents_prev_round.get(index1).centroid(); + double weight1= 1.0; + double weight2= 0.25; + float[][] ret = ageCentriods.weighted_merge( weight1 ,current_cent, weight2, prev_mapped_cent); + float[] cent_merge = ret[1] ; + Centroid test = new Centroid(cent_merge); //Centroid(float[] data) + test1.add(cent_merge); + + } + + cents_aged.clear(); + + int size = test1.size(); + for (int i=0; i<= size-1; i++ ) 
{ + Centroid c = new Centroid(test1.get(i)); + cents_aged.add(c); }; + + System.out.println(cents_aged); + +// + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(cents_aged, data_in_round)); + System.out.println("xxxxxxxxxxxxxxxx this is aged cents xxxxxxxxxxxxxxxxxxxxx"); + +// input: "C:/Users/sayan/OneDrive - University of Cincinnati/Documents/downloaded/run_results/run_results/3runs/har_k6/1D.txt" + + String Output = "/C:/Users/sayan/OneDrive - University of Cincinnati/Documents/downloaded/to_del/har_k6_kmeans_70_cutoff"+"_"+round+"_"+".csv" ; + + VectorUtil.writeCentroidsToFile(new File(Output),cents_aged, false); + +// System.out.printf("%.0f\n", avgrealwcss / count); + + data_in_round.clear(); + count2=0; + + // cents_prev_round.clear(); + // cents_prev_round = cents_aged; + + // System.out.println("ccccccccccccccccccccc this issize cccccccccccccccccccc : " + cents_prev_round.size() ); + + System.gc(); + + } // end if + + System.gc(); + + }// end for + + + System.gc(); + } // end main + + + + + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + this.so.setK(getk); + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + return false; + } + + //@Override + public void setCutoff(int getCutoff) { + this.so.setCutoff(getCutoff); + } + + //@Override + public void setRandomVector(boolean getRandomVector) { + this.so.setRandomVector(getRandomVector); + } + + +} diff --git 
a/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_10runs_agingmicroclusters.java b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_10runs_agingmicroclusters.java new file mode 100644 index 0000000..ee58799 --- /dev/null +++ b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_10runs_agingmicroclusters.java @@ -0,0 +1,1471 @@ +package edu.uc.rphash; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +//import java.util.Arrays; +import java.util.HashMap; +//import java.util.Iterator; +//import java.util.LinkedHashMap; +import java.util.List; +//import java.util.Map; +import java.util.Map.Entry; +import java.util.Random; +import java.util.TreeSet; +import java.util.stream.IntStream; +import java.util.stream.Stream; +import java.util.Collections; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.kneefinder.JythonTest; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.clusterers.Agglomerative3; +import edu.uc.rphash.tests.clusterers.KMeans2; +import edu.uc.rphash.tests.clusterers.Agglomerative3.ClusteringType; +import edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.VectorUtil; +import edu.uc.rphash.aging.ageCentriods; + +//import org.apache.commons.collections.map.MultiValueMap; +//import org.apache.commons.collections.map.*; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Multimap; + +import java.lang.*; + + + +// this algorithm runs twrp 10 times : (only the random bisection vector varies, the Projection matrix remains same) +// and selects the one which has the best wcss offline for the 10X candidate centroids. 
+public class TWRPv6_wcss_offline2_TEST2_10runs_agingmicroclusters implements Clusterer, Runnable { + + boolean znorm = false; + int [] num_of_clusters_stage1 = new int[12]; + int min_k; + int max_k; + + // convert this to an array of arrays + private int counter; + private float[] rngvec; + private float[] rngvec2; + private float[] rngvec3; + private float[] rngvec4; + private float[] rngvec5; + private float[] rngvec6; + private float[] rngvec7; + private float[] rngvec8; + private float[] rngvec9; + private float[] rngvec10; + private float[] rngvec11; + private float[] rngvec12; + + private List centroids = null; + + private RPHashObject so; + + public TWRPv6_wcss_offline2_TEST2_10runs_agingmicroclusters(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + return getCentroids(); + } + + @Override + public List getCentroids() { + if (centroids == null) + run(); + return centroids; + } + + //private Multimap WeightsandCents ; + + +// public Multimap getMicroclusterWeightsandCents (RPHashObject so) { +// this.so = so; +// return getCentroids(); +// } + + +// This function returns the square of the euclidean distance. + public static float distancesq(float[] x, float[] y) { + if (x.length < 1) + return Float.MAX_VALUE; + if (y.length < 1) + return Float.MAX_VALUE; + float dist = (x[0] - y[0]) * (x[0] - y[0]); + for (int i = 1; i < x.length; i++) + dist += ((x[i] - y[i]) * (x[i] - y[i])); +// return (float) Math.sqrt(dist); + return dist; + } + + +// This method finds the smallest of the numbers and returns that index. 
+ + public static int smallest(float[] arr) + { + // Initialize minimum element + float min = arr[0]; + int minindex = 0; + // System.out.println(" LENGHT : " + arr.length); + // Traverse array elements from second and + // compare every element with current max + for (int i = 1; i < (arr.length); i++) { + // System.out.println("the min value of i : " + i); + if (arr[i]< min) { + min = arr[i]; + minindex = i; + } + } + // System.out.println("the min value is : " + min); + // System.out.println("the index for min val is : " + minindex); + return minindex; + } + + + /* + * X - set of vectors compute the medoid of a vector set + */ + float[] medoid(List X) { + float[] ret = X.get(0); + for (int i = 1; i < X.size(); i++) { + for (int j = 0; j < ret.length; j++) { + ret[j] += X.get(i)[j]; + } + } + for (int j = 0; j < ret.length; j++) { + ret[j] = ret[j] / ((float) X.size()); + } + return ret; + } + +// this updates the map two cents with different weights are merged into one. + public static float[][] UpdateHashMap(float cnt_1, float[] x_1, float wcss_1, + float cnt_2, float[] x_2 , float wcss_2) { + + float cnt_r = cnt_1 + cnt_2; + + float[] x_r = new float[x_1.length]; + + for (int i = 0; i < x_1.length; i++) { + x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; + + } + + float wcss = ( ((wcss_1 + distancesq(x_r,x_1)) ) + distancesq(x_r,x_2) ); + + float[][] ret = new float[3][]; + ret[0] = new float[1]; + ret[0][0] = cnt_r; + ret[1] = x_r; + ret[2]= new float [1]; + ret[2][0]= wcss; + return ret; + + } + + + + public long hashvec2( float[] xt, float[] x, + HashMap MapOfIDAndCent, HashMap MapOfIDAndCount, int ct, float[] rngvec, HashMap MapOfIDAndWCSS) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { +// s <<= 1; + s = s << 1 ; // left shift the bits of s by 1. 
+ if (xt[i] > rngvec[i]) + s= s+1; + + if (MapOfIDAndCent.containsKey(s)) { + + float CurrentCount = MapOfIDAndCount.get(s); + float CurrentCent [] = MapOfIDAndCent.get(s); + float CountForIncomingVector = 1; + float IncomingVector [] = x; + float currentWcss= MapOfIDAndWCSS.get(s); + float incomingWcss= 0; + + float[][] MergedValues = UpdateHashMap(CurrentCount , CurrentCent, currentWcss, CountForIncomingVector, IncomingVector, incomingWcss ); + + Long UpdatedCount = (long) MergedValues[0][0] ; + + float[] MergedVector = MergedValues[1] ; + + float wcss= MergedValues[2][0]; + + MapOfIDAndCount.put(s , UpdatedCount); + + MapOfIDAndCent.put(s, MergedVector); + + MapOfIDAndWCSS.put(s, wcss); + + } + + else { + + float[] xlist = x; + MapOfIDAndCent.put(s, xlist); + MapOfIDAndCount.put(s, (long)1); + MapOfIDAndWCSS.put(s, (float)0); + } + } + return s; + } + + + /* + * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid + * vector map + * + * hash the projected vector x and update the hash to centroid and counts + * maps + */ + void addtocounter(float[] x, Projector p, + HashMap IDAndCent,HashMap IDandID,int ct, float[] rngvec , HashMap IDandWCSS) { + float[] xt = p.project(x); + + hashvec2(xt,x,IDAndCent, IDandID, ct,rngvec , IDandWCSS); + } + + + /* + * X - data set k - canonical k in k-means l - clustering sub-space Compute + * density mode via iterative deepening hash counting + */ + + public Multimap findDensityModes2(List data_in_round) { + + HashMap MapOfIDAndCent1 = new HashMap<>(); + HashMap MapOfIDAndCount1 = new HashMap<>(); + HashMap MapOfIDAndWCSS1 = new HashMap<>(); + + HashMap MapOfIDAndCent2 = new HashMap<>(); + HashMap MapOfIDAndCount2 = new HashMap<>(); + HashMap MapOfIDAndWCSS2 = new HashMap<>(); + + HashMap MapOfIDAndCent3 = new HashMap<>(); + HashMap MapOfIDAndCount3 = new HashMap<>(); + HashMap MapOfIDAndWCSS3 = new HashMap<>(); + + HashMap MapOfIDAndCent4 = new HashMap<>(); + HashMap MapOfIDAndCount4 = new HashMap<>(); + HashMap 
MapOfIDAndWCSS4 = new HashMap<>(); + + HashMap MapOfIDAndCent5 = new HashMap<>(); + HashMap MapOfIDAndCount5 = new HashMap<>(); + HashMap MapOfIDAndWCSS5 = new HashMap<>(); + + HashMap MapOfIDAndCent6 = new HashMap<>(); + HashMap MapOfIDAndCount6 = new HashMap<>(); + HashMap MapOfIDAndWCSS6 = new HashMap<>(); + + HashMap MapOfIDAndCent7 = new HashMap<>(); + HashMap MapOfIDAndCount7 = new HashMap<>(); + HashMap MapOfIDAndWCSS7 = new HashMap<>(); + + HashMap MapOfIDAndCent8 = new HashMap<>(); + HashMap MapOfIDAndCount8 = new HashMap<>(); + HashMap MapOfIDAndWCSS8 = new HashMap<>(); + + HashMap MapOfIDAndCent9 = new HashMap<>(); + HashMap MapOfIDAndCount9 = new HashMap<>(); + HashMap MapOfIDAndWCSS9 = new HashMap<>(); + + HashMap MapOfIDAndCent10 = new HashMap<>(); + HashMap MapOfIDAndCount10 = new HashMap<>(); + HashMap MapOfIDAndWCSS10 = new HashMap<>(); + + HashMap MapOfIDAndCent11 = new HashMap<>(); + HashMap MapOfIDAndCount11 = new HashMap<>(); + HashMap MapOfIDAndWCSS11 = new HashMap<>(); + + HashMap MapOfIDAndCent12 = new HashMap<>(); + HashMap MapOfIDAndCount12 = new HashMap<>(); + HashMap MapOfIDAndWCSS12 = new HashMap<>(); + + + // #create projector matrixs + Projector projector = so.getProjectionType(); + projector.setOrigDim(so.getdim()); + projector.setProjectedDim(so.getDimparameter()); + projector.setRandomSeed(so.getRandomSeed()); +// projector.setRandomSeed(535247432); + projector.init(); + + // #create projector matrixs + Projector projector2 = so.getProjectionType(); + projector2.setOrigDim(so.getdim()); + projector2.setProjectedDim(so.getDimparameter()); + projector2.setRandomSeed(so.getRandomSeed()); +// projector.setRandomSeed(535247432); + projector2.init(); + + // #create projector matrixs + Projector projector3 = so.getProjectionType(); + projector3.setOrigDim(so.getdim()); + projector3.setProjectedDim(so.getDimparameter()); + projector3.setRandomSeed(so.getRandomSeed()); +// projector.setRandomSeed(535247432); + projector3.init(); + + int 
cutoff = so.getCutoff(); + + int ct = 0; + + { + + for (float[] x : data_in_round) + + + + { + addtocounter(x, projector, MapOfIDAndCent1, MapOfIDAndCount1,ct++, rngvec, MapOfIDAndWCSS1); + addtocounter(x, projector, MapOfIDAndCent2, MapOfIDAndCount2,ct++, rngvec2,MapOfIDAndWCSS2); + addtocounter(x, projector, MapOfIDAndCent3, MapOfIDAndCount3,ct++, rngvec3,MapOfIDAndWCSS3); + addtocounter(x, projector, MapOfIDAndCent4, MapOfIDAndCount4,ct++, rngvec4,MapOfIDAndWCSS4); + + addtocounter(x, projector2, MapOfIDAndCent5, MapOfIDAndCount5,ct++, rngvec5,MapOfIDAndWCSS5); + addtocounter(x, projector2, MapOfIDAndCent6, MapOfIDAndCount6,ct++, rngvec6, MapOfIDAndWCSS6); + addtocounter(x, projector2, MapOfIDAndCent7, MapOfIDAndCount7,ct++, rngvec7,MapOfIDAndWCSS7); + addtocounter(x, projector2, MapOfIDAndCent8, MapOfIDAndCount8,ct++, rngvec8,MapOfIDAndWCSS8); + + addtocounter(x, projector3, MapOfIDAndCent9, MapOfIDAndCount9,ct++, rngvec9,MapOfIDAndWCSS9); + addtocounter(x, projector3, MapOfIDAndCent10, MapOfIDAndCount10,ct++, rngvec10,MapOfIDAndWCSS10); + addtocounter(x, projector3, MapOfIDAndCent11, MapOfIDAndCount11,ct++, rngvec11,MapOfIDAndWCSS11); + addtocounter(x, projector3, MapOfIDAndCent12, MapOfIDAndCount12,ct++, rngvec12,MapOfIDAndWCSS12); + + + } + } + + System.out.println("NumberOfMicroClustersBeforePruning = "+ MapOfIDAndCent3.size()); + + // next we want to prune the tree by parent count comparison + // follows breadthfirst search + + HashMap denseSetOfIDandCount2_1 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount1.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount1.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount1.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_1.put(parent_id, 0L); + + MapOfIDAndCent1.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_1.put(cur_id, (long) 
cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_1.remove(parent_id); + + MapOfIDAndCent1.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_2 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount2.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount2.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount2.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_2.put(parent_id, 0L); + + MapOfIDAndCent2.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_2.remove(parent_id); + + MapOfIDAndCent2.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_3 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount3.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount3.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount3.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_3.put(parent_id, 0L); + + MapOfIDAndCent3.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_3.remove(parent_id); + + MapOfIDAndCent3.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_4 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount4.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) 
(MapOfIDAndCount4.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount4.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_4.put(parent_id, 0L); + + MapOfIDAndCent4.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_4.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_4.remove(parent_id); + + MapOfIDAndCent4.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_4.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_5 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount5.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount5.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount5.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_5.put(parent_id, 0L); + + MapOfIDAndCent5.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_5.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_5.remove(parent_id); + + MapOfIDAndCent5.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_5.put(cur_id, (long) cur_count); + } + } + } + } + } + + + + HashMap denseSetOfIDandCount2_6 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount6.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount6.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount6.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_6.put(parent_id, 0L); + + MapOfIDAndCent6.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_6.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > 
parent_count) { + denseSetOfIDandCount2_6.remove(parent_id); + + MapOfIDAndCent6.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_6.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_7 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount7.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount7.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount7.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_7.put(parent_id, 0L); + + MapOfIDAndCent7.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_7.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_7.remove(parent_id); + + MapOfIDAndCent7.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_7.put(cur_id, (long) cur_count); + } + } + } + } + } + + HashMap denseSetOfIDandCount2_8 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount8.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount8.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount8.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_8.put(parent_id, 0L); + + MapOfIDAndCent8.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_8.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_8.remove(parent_id); + + MapOfIDAndCent8.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_8.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_9 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount9.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount9.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + 
int parent_count = (int) (MapOfIDAndCount9.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_9.put(parent_id, 0L); + + MapOfIDAndCent9.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_9.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_9.remove(parent_id); + + MapOfIDAndCent9.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_9.put(cur_id, (long) cur_count); + } + } + } + } + } + + HashMap denseSetOfIDandCount2_10 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount10.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount10.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount10.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_10.put(parent_id, 0L); + + MapOfIDAndCent10.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_10.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_10.remove(parent_id); + + MapOfIDAndCent10.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_10.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_11 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount11.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount11.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount11.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_11.put(parent_id, 0L); + + MapOfIDAndCent11.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_11.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_11.remove(parent_id); + + 
MapOfIDAndCent11.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_11.put(cur_id, (long) cur_count); + } + } + } + } + } + + HashMap denseSetOfIDandCount2_12 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount12.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount12.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount12.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_12.put(parent_id, 0L); + + MapOfIDAndCent12.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_12.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_12.remove(parent_id); + + MapOfIDAndCent12.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_12.put(cur_id, (long) cur_count); + } + } + } + } + } + + //remove keys with support less than 1 + Stream> stream2_1 = denseSetOfIDandCount2_1.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_1= new ArrayList<>(); + // sort and limit the list + stream2_1.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_1.add(x.getKey())); + + + Stream> stream2_2 = denseSetOfIDandCount2_2.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_2= new ArrayList<>(); + // sort and limit the list + stream2_2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_2.add(x.getKey())); + + + Stream> stream2_3 = denseSetOfIDandCount2_3.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_3= new ArrayList<>(); + // sort and limit the list + stream2_3.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_3.add(x.getKey())); + + + Stream> stream2_4 = denseSetOfIDandCount2_4.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_4= new ArrayList<>(); + // sort and limit the list + stream2_4.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_4.add(x.getKey())); + + Stream> stream2_5 = denseSetOfIDandCount2_5.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_5= new ArrayList<>(); + // sort and limit the list + stream2_5.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_5.add(x.getKey())); + + + Stream> stream2_6 = denseSetOfIDandCount2_6.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_6= new ArrayList<>(); + // sort and limit the list + stream2_6.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_6.add(x.getKey())); + + + Stream> stream2_7 = denseSetOfIDandCount2_7.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_7= new ArrayList<>(); + // sort and limit the list + stream2_7.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_7.add(x.getKey())); + + + Stream> stream2_8 = denseSetOfIDandCount2_8.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_8= new ArrayList<>(); + // sort and limit the list + stream2_8.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_8.add(x.getKey())); + + + Stream> stream2_9 = denseSetOfIDandCount2_9.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_9= new ArrayList<>(); + // sort and limit the list + stream2_9.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_9.add(x.getKey())); + + + Stream> stream2_10 = denseSetOfIDandCount2_10.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_10= new ArrayList<>(); + // sort and limit the list + stream2_10.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_10.add(x.getKey())); + + + Stream> stream2_11 = denseSetOfIDandCount2_11.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_11= new ArrayList<>(); + // sort and limit the list + stream2_11.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_11.add(x.getKey())); + + Stream> stream2_12 = denseSetOfIDandCount2_12.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_12= new ArrayList<>(); + // sort and limit the list + stream2_12.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_12.add(x.getKey())); + +// finding elbows + JythonTest elbowcalculator = new JythonTest(); + double sum_jt = 0; + + num_of_clusters_stage1[0]= elbowcalculator.find_elbow(sortedIDList2_1); + num_of_clusters_stage1[1]= elbowcalculator.find_elbow(sortedIDList2_2); + num_of_clusters_stage1[2]= elbowcalculator.find_elbow(sortedIDList2_3); + num_of_clusters_stage1[3]= elbowcalculator.find_elbow(sortedIDList2_4); + num_of_clusters_stage1[4]= elbowcalculator.find_elbow(sortedIDList2_5); + num_of_clusters_stage1[5]= elbowcalculator.find_elbow(sortedIDList2_6); + num_of_clusters_stage1[6]= elbowcalculator.find_elbow(sortedIDList2_7); + num_of_clusters_stage1[7]= elbowcalculator.find_elbow(sortedIDList2_8); + num_of_clusters_stage1[8]= elbowcalculator.find_elbow(sortedIDList2_9); + num_of_clusters_stage1[9]= elbowcalculator.find_elbow(sortedIDList2_10); + num_of_clusters_stage1[10]= elbowcalculator.find_elbow(sortedIDList2_11); + num_of_clusters_stage1[11]= 
elbowcalculator.find_elbow(sortedIDList2_12); + + for (int i=0 ; i<12; i++) { + + //int num_of_clusters_2= elbowcalculator.find_elbow(counts); +//// System.out.println("\n" + "No. of clusters_stage1 = " + num_of_clusters_stage1[i]); + //System.out.println( "No. of clusters_by_COUNT = " + num_of_clusters_2); +//// System.out.println( "************************************************************" ); + sum_jt = sum_jt + num_of_clusters_stage1[i]; + } +//// System.out.println("\n" + "sum of No. of clusters_stage1 = " + sum_jt); + + double avg_clus_stage1 = Math.ceil(sum_jt / 12.0) ; + System.out.println("\n" + "Average of No. of clusters_stage1 = " + avg_clus_stage1 ); + System.out.println( "************************************************************" ); + // finding the range of K for offline clustering : + min_k = (int) (avg_clus_stage1 - 3) ; + + max_k = (int) (avg_clus_stage1 + 3) ; + + if (min_k < 3 ) { min_k = 3 ; max_k = 9; } ; + + + float WCSS1 = 0; + float WCSS2 = 0; + float WCSS3 = 0; + float WCSS4 = 0; + float WCSS5 = 0; + float WCSS6 = 0; + float WCSS7 = 0; + float WCSS8 = 0; + float WCSS9 = 0; + float WCSS10 = 0; + float WCSS11 = 0; + float WCSS12 = 0; + + + HashMap denseSetOfIDandCount2 = new HashMap(); + HashMap MapOfIDAndCent = new HashMap<>(); + HashMap MapOfIDAndCount = new HashMap<>(); + HashMap MapOfIDAndWCSS = new HashMap<>(); + + + //* THIS IS THE RUNTIME CALCULATION OF WCSS STATISTICS WHICH IS DONE ONLINE: + + for (Long keys: sortedIDList2_1){ + WCSS1 = WCSS1 + MapOfIDAndWCSS1.get(keys);} + + for (Long keys: sortedIDList2_2) + { WCSS2 = WCSS2 + MapOfIDAndWCSS2.get(keys);} + + for (Long keys: sortedIDList2_3) + { WCSS3 = WCSS3 + MapOfIDAndWCSS3.get(keys);} + + for (Long keys: sortedIDList2_4) + { WCSS4 = WCSS4 + MapOfIDAndWCSS4.get(keys);} + + for (Long keys: sortedIDList2_5) + { WCSS5 = WCSS5 + MapOfIDAndWCSS5.get(keys);} + + for (Long keys: sortedIDList2_6){ + WCSS6 = WCSS6 + MapOfIDAndWCSS6.get(keys);} + + for (Long keys: sortedIDList2_7) + { 
WCSS7 = WCSS7 + MapOfIDAndWCSS7.get(keys);} + + for (Long keys: sortedIDList2_8) + { WCSS8 = WCSS8 + MapOfIDAndWCSS8.get(keys);} + + for (Long keys: sortedIDList2_9) + { WCSS9 = WCSS9 + MapOfIDAndWCSS9.get(keys);} + + for (Long keys: sortedIDList2_10) + { WCSS10 = WCSS10 + MapOfIDAndWCSS10.get(keys);} + + for (Long keys: sortedIDList2_11) + { WCSS11 = WCSS11 + MapOfIDAndWCSS11.get(keys);} + + for (Long keys: sortedIDList2_12) + { WCSS12 = WCSS12 + MapOfIDAndWCSS12.get(keys);} + + + + System.out.print(" wcss1 = " + WCSS1); + + System.out.print(" wcss2 = " + WCSS2); + + System.out.print(" wcss3 = " + WCSS3); + + System.out.print(" wcss4 = " + WCSS4); + + System.out.print(" wcss5 = " + WCSS5); + + System.out.print(" wcss6 = " + WCSS6); + + System.out.print(" wcss7 = " + WCSS7); + + System.out.print(" wcss8 = " + WCSS8); + + System.out.print(" wcss9 = " + WCSS9); + + System.out.print(" wcss10 = " + WCSS10); + + System.out.print(" wcss11 = " + WCSS11); + + System.out.print(" wcss12 = " + WCSS12); + +// float arr[] = {WCSS_off_1,WCSS_off_2,WCSS_off_3,WCSS_off_4,WCSS_off_5,WCSS_off_6,WCSS_off_7,WCSS_off_8,WCSS_off_9,WCSS_off_10}; + float arr[] = {WCSS1,WCSS2,WCSS3,WCSS4,WCSS5,WCSS6,WCSS7,WCSS8,WCSS9,WCSS10, WCSS11, WCSS12 }; + + int index_of_max = smallest(arr); + + if (index_of_max == 0){ + MapOfIDAndCount = MapOfIDAndCount1; + MapOfIDAndCent = MapOfIDAndCent1; + MapOfIDAndWCSS = MapOfIDAndWCSS1; + denseSetOfIDandCount2 = denseSetOfIDandCount2_1; + System.out.println("winner = tree1"); + } + if (index_of_max == 1){ + MapOfIDAndCount = MapOfIDAndCount2; + MapOfIDAndCent = MapOfIDAndCent2; + MapOfIDAndWCSS = MapOfIDAndWCSS2; + denseSetOfIDandCount2 = denseSetOfIDandCount2_2; + System.out.println("winner = tree2"); + } + if (index_of_max == 2){ + MapOfIDAndCount = MapOfIDAndCount3; + MapOfIDAndCent = MapOfIDAndCent3; + MapOfIDAndWCSS = MapOfIDAndWCSS3; + denseSetOfIDandCount2 = denseSetOfIDandCount2_3; + System.out.println("winner = tree3"); + } + if (index_of_max == 3) { + 
MapOfIDAndCount = MapOfIDAndCount4; + MapOfIDAndCent = MapOfIDAndCent4; + MapOfIDAndWCSS = MapOfIDAndWCSS4; + denseSetOfIDandCount2 = denseSetOfIDandCount2_4; + System.out.println("winner = tree4"); + } + + if (index_of_max == 4) { + MapOfIDAndCount = MapOfIDAndCount5; + MapOfIDAndCent = MapOfIDAndCent5; + MapOfIDAndWCSS = MapOfIDAndWCSS5; + denseSetOfIDandCount2 = denseSetOfIDandCount2_5; + System.out.println("winner = tree5"); + } + if (index_of_max == 5) { + MapOfIDAndCount = MapOfIDAndCount6; + MapOfIDAndCent = MapOfIDAndCent6; + MapOfIDAndWCSS = MapOfIDAndWCSS6; + denseSetOfIDandCount2 = denseSetOfIDandCount2_6; + System.out.println("winner = tree6"); + } + if (index_of_max == 6) { + MapOfIDAndCount = MapOfIDAndCount7; + MapOfIDAndCent = MapOfIDAndCent7; + MapOfIDAndWCSS = MapOfIDAndWCSS7; + denseSetOfIDandCount2 = denseSetOfIDandCount2_7; + System.out.println("winner = tree7"); + } + if (index_of_max == 7) { + MapOfIDAndCount = MapOfIDAndCount8; + MapOfIDAndCent = MapOfIDAndCent8; + MapOfIDAndWCSS = MapOfIDAndWCSS8; + denseSetOfIDandCount2 = denseSetOfIDandCount2_8; + System.out.println("winner = tree8"); + } + if (index_of_max == 8) { + MapOfIDAndCount = MapOfIDAndCount9; + MapOfIDAndCent = MapOfIDAndCent9; + MapOfIDAndWCSS = MapOfIDAndWCSS9; + denseSetOfIDandCount2 = denseSetOfIDandCount2_9; + System.out.println("winner = tree9"); + } + if (index_of_max == 9) { + MapOfIDAndCount = MapOfIDAndCount10; + MapOfIDAndCent = MapOfIDAndCent10; + MapOfIDAndWCSS = MapOfIDAndWCSS10; + denseSetOfIDandCount2 = denseSetOfIDandCount2_10; + System.out.println("winner = tree10"); + } + if (index_of_max == 10) { + MapOfIDAndCount = MapOfIDAndCount11; + MapOfIDAndCent = MapOfIDAndCent11; + MapOfIDAndWCSS = MapOfIDAndWCSS11; + denseSetOfIDandCount2 = denseSetOfIDandCount2_11; + System.out.println("winner = tree11"); + } + if (index_of_max == 11) { + MapOfIDAndCount = MapOfIDAndCount12; + MapOfIDAndCent = MapOfIDAndCent12; + MapOfIDAndWCSS = MapOfIDAndWCSS12; + 
denseSetOfIDandCount2 = denseSetOfIDandCount2_12; + System.out.println("winner = tree12"); + } + + + System.out.println("NumberOfMicroClustersAfterPruning&beforesortingLimit = "+ denseSetOfIDandCount2.size()); + + //remove keys with support less than 1 + Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2= new ArrayList<>(); + // sort and limit the list + stream2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2.add(x.getKey())); + + Multimap multimapWeightAndCent = ArrayListMultimap.create(); + + for (Long keys: sortedIDList2) + + { + + multimapWeightAndCent.put((Long)(MapOfIDAndCount.get(keys)), (float[]) (MapOfIDAndCent.get(keys))); + + } + + + return multimapWeightAndCent; + + +} + + + // this method gets the hashmap of ids and counts for the top n(cutoff) microclusters +// public static HashMap getmicroclusterIDandCount + + + + // this method gets the hashmap of ids and centroids for the top n(cutoff) microclusters + +// public static HashMap getmicroclusterIDandCents + + + + // this method gets the multihashmap of counts and centroids for the top n(cutoff) microclusters + + + public void run() { + + + List data_in_round = new ArrayList() ; + int count1=0; + int count2=0; + // List cents_aged = null ; /// may required to be initialized + // List cents_prev_round = null ; /// may required to be properly initialized + + boolean flag = true; // indicates first round if true else is false + Multimap WeightAndClusters = ArrayListMultimap.create() ; // null; + Multimap WeightAndClusters_prev = ArrayListMultimap.create() ; // null; + // Multimap WeightAndClusters_aged = ArrayListMultimap.create() ; // null; + + Listcentroids_prev = new ArrayList<>(); + List weights_prev =new ArrayList<>(); + + Listcentroids2 = new ArrayList<>(); + List weights2 =new ArrayList<>(); + + Listcentroids3 = new ArrayList<>(); + List weights3 =new ArrayList<>(); + + 
//System.out.println(count2); + + for (float[] x : so.getRawData()) { + + count1 = count1+1; + //System.out.println(count1); + //System.out.println(element); + //data_in_round.add(so.getRawData().get(count1)); + + data_in_round.add(x); + count1 = count1+1; + count2 = count2 +1; + + //System.out.println(count2); + + if (count2 >= 2500) { + + System.out.println(count2); + + + rngvec = new float[so.getDimparameter()]; + rngvec2 = new float[so.getDimparameter()]; + rngvec3 = new float[so.getDimparameter()]; + rngvec4 = new float[so.getDimparameter()]; + rngvec5 = new float[so.getDimparameter()]; + rngvec6 = new float[so.getDimparameter()]; + rngvec7 = new float[so.getDimparameter()]; + rngvec8 = new float[so.getDimparameter()]; + rngvec9 = new float[so.getDimparameter()]; + rngvec10 = new float[so.getDimparameter()]; + rngvec11 = new float[so.getDimparameter()]; + rngvec12 = new float[so.getDimparameter()]; + + counter = 0; + boolean randVect = so.getRandomVector(); + + // Random r = new Random(so.getRandomSeed()); + // Random r = new Random(3800635955020675334L) ; + Random r = new Random(); + Random r2 = new Random(); + Random r3 = new Random(); + Random r4 = new Random(); + Random r5 = new Random(); + Random r6 = new Random(); + Random r7 = new Random(); + Random r8 = new Random(); + Random r9 = new Random(); + Random r10 = new Random(); + Random r11 = new Random(); + Random r12 = new Random(); + + if (randVect==true){ + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) r.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec2[i] = (float) r2.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec3[i] = (float) r3.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec4[i] = (float) r4.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec5[i] = (float) r5.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec6[i] = (float) r6.nextGaussian(); + for (int i = 
0; i < so.getDimparameter(); i++) + rngvec7[i] = (float) r7.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec8[i] = (float) r8.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec9[i] = (float) r9.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec10[i] = (float) r10.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec11[i] = (float) r11.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec12[i] = (float) r12.nextGaussian(); + + } + + else { + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) 0; + + } + + WeightAndClusters = findDensityModes2(data_in_round); + + if ( flag == true) { + WeightAndClusters_prev = WeightAndClusters; + flag = false ; + } + + // WeightAndClusters_prev=WeightAndClusters_aged; + + // WeightAndClusters_aged.clear(); + + // System.out.println("multimap = "+ WeightAndClusters); + + // System.out.println("\tNumberOfMicroClusters_AfterPruning = "+ WeightAndClusters.size()); + // System.out.println("getRandomVector = "+ randVect); + // System.out.println("getRandomVector = "+ randVect); + + + for (Long weights : WeightAndClusters.keys()) + { + weights2.add((float)weights); + } + System.out.println("curr_keys = "+ weights2); + + for (Long weight : WeightAndClusters.keySet()) + + { + centroids2.addAll(WeightAndClusters.get(weight)); + } + + + weights_prev.clear(); + + for (Long weights : WeightAndClusters_prev.keys()) + + { + float temp = (float) (0.25 * weights); + weights_prev.add((float)temp); + } + + System.out.println("kweighted_prev= "+ weights_prev); + + + centroids_prev.clear(); + for (Long weights : WeightAndClusters_prev.keySet()) + + { + centroids_prev.addAll(WeightAndClusters_prev.get(weights)); + } + + + + + for ( float w : weights_prev) + + { + weights2.add(w); + } + + System.out.println("keys_joined = "+ weights2); + + for (float[] c : centroids_prev) + + { + + centroids2.add(c); + + } + + //
System.out.println("merged weights size = "+ weights2.size()); + // System.out.println("merged cents size = "+ centroids2.size()); + + // trim the weights2 and centroids2 to fix size : + // logic: select the top n weights and its index from weights2 , then select the centroids from those index in centroids2 + + // Collections.sort(weights2, Collections.reverseOrder()); + // weights2.sort(Comparator.reverseOrder()); + + + int[] sortedIndices = IntStream.range(0, weights2.size()) + .boxed().sorted((i, j) -> weights2.get(j).compareTo( weights2.get(i)) ) + .mapToInt(ele -> ele).toArray(); + System.out.println("sorted_index= "+ Arrays. toString(sortedIndices)); + + // create weights3 and centroid3 and then select the top 60 or cutoff elements. + + int limit=so.getCutoff() + 10; + + for (int i=0; i< Math.min(limit + 1, sortedIndices.length) ;i++) + + { + int indx = sortedIndices[i] ; // check + + Float key_in_indx = weights2.get(indx); // weights2 is list of floats + + weights3.add( key_in_indx); + + float[] cent_in_indx = centroids2.get(indx); + + centroids3.add(cent_in_indx); + + } + + System.out.println("keys_joined = "+ weights3); + + System.out.println("size of weights3 = "+ weights3.size()); + + System.out.println("size of centroids3 = "+ centroids3.size()); + + // create a multimap and add the weights 3 and centriods3 and set it as agedmultimap + + // also create the multimap aged from this weights2 and cents2.
+ + Multimap WeightAndClusters_aged = ArrayListMultimap.create(); + + WeightAndClusters_aged.clear(); + + for (int i=0; i< weights3.size(); i++) + + { + WeightAndClusters_aged.put((weights3.get(i).longValue()), (float[]) (centroids3.get(i))); + } + + +// Agglomerative3 aggloOffline = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids2, so.getk()); +// aggloOffline.setWeights(weights2); +// this.centroids = aggloOffline.getCentroids(); + + KMeans2 aggloOffline2 = new KMeans2(); + aggloOffline2.setRawData(centroids3); + aggloOffline2.setWeights(weights3); + + + List elbow_wcss = new ArrayList<>(); + + for (int k=min_k; k<=max_k ;k++) { + + aggloOffline2.setK(k); + List cent1 = aggloOffline2.getCentroids(); + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(cent1, data_in_round)); + long tempu = (long) StatTests.WCSSECentroidsFloat(cent1, data_in_round); + elbow_wcss.add(tempu); + } + + + // finding elbows + JythonTest elbowcalculator = new JythonTest(); + double sum_jt = 0; + + int num_of_clusters_stage2 = min_k + elbowcalculator.find_elbow(elbow_wcss); + + + System.out.println("\n" + "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx " ); + System.out.println("\n" + " No. of clusters_stage_2_Final = " + num_of_clusters_stage2 ); + System.out.println("\n" + "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx " ); + + System.out.println("\n" + "No. 
of Data Points = " + so.getRawData().size() ); + + + // final choice of centroids : ( repetative calculation , please optimize ) + aggloOffline2.setK(num_of_clusters_stage2); + + this.centroids = aggloOffline2.getCentroids(); + + count2=0; + data_in_round.clear(); + weights2.clear(); + centroids2.clear(); + + weights3.clear(); + centroids3.clear(); + + WeightAndClusters.clear(); + + WeightAndClusters_prev.clear(); + + WeightAndClusters_prev = WeightAndClusters_aged; + + } // end of the if loop + + + } // end of the for loop + + + } // end of run method + + + public static void main(String[] args) throws FileNotFoundException, + IOException , InterruptedException { + + + float avgtime = 0; + // System.out.printf("%f\t", f); + + // List data = "/C:/Users/user/Desktop/temp/OutputTwrpCents1" + + // RPHashObject o = new SimpleArrayReader(gen.data, k); + + boolean raw = Boolean.parseBoolean(("raw")); + List data = null; + + // "C:\Users\sayan\OneDrive - University of Cincinnati\Documents\downloaded\run_results\run_results\3runs\har_k6\1D.txt" + + String inputfile = "C:/Users/sayan/OneDrive - University of Cincinnati/Documents/downloaded/run_results/run_results/3runs/har_k6/1D.txt" ; + System.out.println(inputfile); + + data = VectorUtil.readFile( inputfile , raw); + + int dummyk = 8; + RPHashObject o = new SimpleArrayReader(data, dummyk); + + o.setDimparameter(16); + o.setCutoff(60); + o.setRandomVector(true); + + TWRPv6_wcss_offline2_TEST2_10runs_agingmicroclusters rphit = new TWRPv6_wcss_offline2_TEST2_10runs_agingmicroclusters(o); + long startTime = System.nanoTime(); + // rphit.getCentroids(); + rphit.run(); + +// avgtime += (System.nanoTime() - startTime) / 100000000; + + + System.gc(); + + + } + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public 
void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + this.so.setK(getk); + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + return false; + } + + //@Override + public void setCutoff(int getCutoff) { + this.so.setCutoff(getCutoff); + } + + //@Override + public void setRandomVector(boolean getRandomVector) { + this.so.setRandomVector(getRandomVector); + } + + +} diff --git a/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_10runs_static_stream.java b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_10runs_static_stream.java new file mode 100644 index 0000000..4273119 --- /dev/null +++ b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_10runs_static_stream.java @@ -0,0 +1,1365 @@ +package edu.uc.rphash; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +//import java.util.Arrays; +import java.util.HashMap; +//import java.util.Iterator; +//import java.util.LinkedHashMap; +import java.util.List; +//import java.util.Map; +import java.util.Map.Entry; +import java.util.Random; +import java.util.TreeSet; +import java.util.stream.Stream; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.kneefinder.JythonTest; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.clusterers.Agglomerative3; +import edu.uc.rphash.tests.clusterers.KMeans2; +import edu.uc.rphash.tests.clusterers.Agglomerative3.ClusteringType; +import edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.VectorUtil; + +//import org.apache.commons.collections.map.MultiValueMap; 
+//import org.apache.commons.collections.map.*; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Multimap; + + + +// this algorithm runs twrp 12 times : (3 projection matrices, each paired with 4 random bisection vectors) +// and selects the one which has the best wcss offline for the 12X candidate centroids. +public class TWRPv6_wcss_offline2_TEST2_10runs_static_stream implements Clusterer, Runnable { + + boolean znorm = false; + int [] num_of_clusters_stage1 = new int[12]; + int min_k; + int max_k; + + // convert this to an array of arrays + private int counter; + private float[] rngvec; + private float[] rngvec2; + private float[] rngvec3; + private float[] rngvec4; + private float[] rngvec5; + private float[] rngvec6; + private float[] rngvec7; + private float[] rngvec8; + private float[] rngvec9; + private float[] rngvec10; + private float[] rngvec11; + private float[] rngvec12; + + private List centroids = null; + + private RPHashObject so; + + public TWRPv6_wcss_offline2_TEST2_10runs_static_stream(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + return getCentroids(); + } + + @Override + public List getCentroids() { + if (centroids == null) + run(); + return centroids; + } + + +// This function returns the square of the euclidean distance. + public static float distancesq(float[] x, float[] y) { + if (x.length < 1) + return Float.MAX_VALUE; + if (y.length < 1) + return Float.MAX_VALUE; + float dist = (x[0] - y[0]) * (x[0] - y[0]); + for (int i = 1; i < x.length; i++) + dist += ((x[i] - y[i]) * (x[i] - y[i])); +// return (float) Math.sqrt(dist); + return dist; + } + + +// This method finds the smallest of the numbers and returns that index.
+ + public static int smallest(float[] arr) + { + // Initialize minimum element + float min = arr[0]; + int minindex = 0; + // System.out.println(" LENGHT : " + arr.length); + // Traverse array elements from second and + // compare every element with current max + for (int i = 1; i < (arr.length); i++) { + // System.out.println("the min value of i : " + i); + if (arr[i]< min) { + min = arr[i]; + minindex = i; + } + } + // System.out.println("the min value is : " + min); + // System.out.println("the index for min val is : " + minindex); + return minindex; + } + + + /* + * X - set of vectors compute the medoid of a vector set + */ + float[] medoid(List X) { + float[] ret = X.get(0); + for (int i = 1; i < X.size(); i++) { + for (int j = 0; j < ret.length; j++) { + ret[j] += X.get(i)[j]; + } + } + for (int j = 0; j < ret.length; j++) { + ret[j] = ret[j] / ((float) X.size()); + } + return ret; + } + +// this updates the map two cents with different weights are merged into one. + public static float[][] UpdateHashMap(float cnt_1, float[] x_1, float wcss_1, + float cnt_2, float[] x_2 , float wcss_2) { + + float cnt_r = cnt_1 + cnt_2; + + float[] x_r = new float[x_1.length]; + + for (int i = 0; i < x_1.length; i++) { + x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; + + } + + float wcss = ( ((wcss_1 + distancesq(x_r,x_1)) ) + distancesq(x_r,x_2) ); + + float[][] ret = new float[3][]; + ret[0] = new float[1]; + ret[0][0] = cnt_r; + ret[1] = x_r; + ret[2]= new float [1]; + ret[2][0]= wcss; + return ret; + + } + + + + public long hashvec2( float[] xt, float[] x, + HashMap MapOfIDAndCent, HashMap MapOfIDAndCount, int ct, float[] rngvec, HashMap MapOfIDAndWCSS) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { +// s <<= 1; + s = s << 1 ; // left shift the bits of s by 1. 
+ if (xt[i] > rngvec[i]) + s= s+1; + + if (MapOfIDAndCent.containsKey(s)) { + + float CurrentCount = MapOfIDAndCount.get(s); + float CurrentCent [] = MapOfIDAndCent.get(s); + float CountForIncomingVector = 1; + float IncomingVector [] = x; + float currentWcss= MapOfIDAndWCSS.get(s); + float incomingWcss= 0; + + float[][] MergedValues = UpdateHashMap(CurrentCount , CurrentCent, currentWcss, CountForIncomingVector, IncomingVector, incomingWcss ); + + Long UpdatedCount = (long) MergedValues[0][0] ; + + float[] MergedVector = MergedValues[1] ; + + float wcss= MergedValues[2][0]; + + MapOfIDAndCount.put(s , UpdatedCount); + + MapOfIDAndCent.put(s, MergedVector); + + MapOfIDAndWCSS.put(s, wcss); + + } + + else { + + float[] xlist = x; + MapOfIDAndCent.put(s, xlist); + MapOfIDAndCount.put(s, (long)1); + MapOfIDAndWCSS.put(s, (float)0); + } + } + return s; + } + + + /* + * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid + * vector map + * + * hash the projected vector x and update the hash to centroid and counts + * maps + */ + void addtocounter(float[] x, Projector p, + HashMap IDAndCent,HashMap IDandID,int ct, float[] rngvec , HashMap IDandWCSS) { + float[] xt = p.project(x); + + hashvec2(xt,x,IDAndCent, IDandID, ct,rngvec , IDandWCSS); + } + + + /* + * X - data set k - canonical k in k-means l - clustering sub-space Compute + * density mode via iterative deepening hash counting + */ + + public Multimap findDensityModes2() { + + HashMap MapOfIDAndCent1 = new HashMap<>(); + HashMap MapOfIDAndCount1 = new HashMap<>(); + HashMap MapOfIDAndWCSS1 = new HashMap<>(); + + HashMap MapOfIDAndCent2 = new HashMap<>(); + HashMap MapOfIDAndCount2 = new HashMap<>(); + HashMap MapOfIDAndWCSS2 = new HashMap<>(); + + HashMap MapOfIDAndCent3 = new HashMap<>(); + HashMap MapOfIDAndCount3 = new HashMap<>(); + HashMap MapOfIDAndWCSS3 = new HashMap<>(); + + HashMap MapOfIDAndCent4 = new HashMap<>(); + HashMap MapOfIDAndCount4 = new HashMap<>(); + HashMap MapOfIDAndWCSS4 = 
new HashMap<>(); + + HashMap MapOfIDAndCent5 = new HashMap<>(); + HashMap MapOfIDAndCount5 = new HashMap<>(); + HashMap MapOfIDAndWCSS5 = new HashMap<>(); + + HashMap MapOfIDAndCent6 = new HashMap<>(); + HashMap MapOfIDAndCount6 = new HashMap<>(); + HashMap MapOfIDAndWCSS6 = new HashMap<>(); + + HashMap MapOfIDAndCent7 = new HashMap<>(); + HashMap MapOfIDAndCount7 = new HashMap<>(); + HashMap MapOfIDAndWCSS7 = new HashMap<>(); + + HashMap MapOfIDAndCent8 = new HashMap<>(); + HashMap MapOfIDAndCount8 = new HashMap<>(); + HashMap MapOfIDAndWCSS8 = new HashMap<>(); + + HashMap MapOfIDAndCent9 = new HashMap<>(); + HashMap MapOfIDAndCount9 = new HashMap<>(); + HashMap MapOfIDAndWCSS9 = new HashMap<>(); + + HashMap MapOfIDAndCent10 = new HashMap<>(); + HashMap MapOfIDAndCount10 = new HashMap<>(); + HashMap MapOfIDAndWCSS10 = new HashMap<>(); + + HashMap MapOfIDAndCent11 = new HashMap<>(); + HashMap MapOfIDAndCount11 = new HashMap<>(); + HashMap MapOfIDAndWCSS11 = new HashMap<>(); + + HashMap MapOfIDAndCent12 = new HashMap<>(); + HashMap MapOfIDAndCount12 = new HashMap<>(); + HashMap MapOfIDAndWCSS12 = new HashMap<>(); + + + // #create projector matrixs + Projector projector = so.getProjectionType(); + projector.setOrigDim(so.getdim()); + projector.setProjectedDim(so.getDimparameter()); + projector.setRandomSeed(so.getRandomSeed()); +// projector.setRandomSeed(535247432); + projector.init(); + + // #create projector matrixs + Projector projector2 = so.getProjectionType(); + projector2.setOrigDim(so.getdim()); + projector2.setProjectedDim(so.getDimparameter()); + projector2.setRandomSeed(so.getRandomSeed()); +// projector.setRandomSeed(535247432); + projector2.init(); + + // #create projector matrixs + Projector projector3 = so.getProjectionType(); + projector3.setOrigDim(so.getdim()); + projector3.setProjectedDim(so.getDimparameter()); + projector3.setRandomSeed(so.getRandomSeed()); +// projector.setRandomSeed(535247432); + projector3.init(); + + int cutoff = 
so.getCutoff(); + + int ct = 0; + + { + + for (float[] x : so.getRawData()) + { + addtocounter(x, projector, MapOfIDAndCent1, MapOfIDAndCount1,ct++, rngvec, MapOfIDAndWCSS1); + addtocounter(x, projector, MapOfIDAndCent2, MapOfIDAndCount2,ct++, rngvec2,MapOfIDAndWCSS2); + addtocounter(x, projector, MapOfIDAndCent3, MapOfIDAndCount3,ct++, rngvec3,MapOfIDAndWCSS3); + addtocounter(x, projector, MapOfIDAndCent4, MapOfIDAndCount4,ct++, rngvec4,MapOfIDAndWCSS4); + + addtocounter(x, projector2, MapOfIDAndCent5, MapOfIDAndCount5,ct++, rngvec5,MapOfIDAndWCSS5); + addtocounter(x, projector2, MapOfIDAndCent6, MapOfIDAndCount6,ct++, rngvec6, MapOfIDAndWCSS6); + addtocounter(x, projector2, MapOfIDAndCent7, MapOfIDAndCount7,ct++, rngvec7,MapOfIDAndWCSS7); + addtocounter(x, projector2, MapOfIDAndCent8, MapOfIDAndCount8,ct++, rngvec8,MapOfIDAndWCSS8); + + addtocounter(x, projector3, MapOfIDAndCent9, MapOfIDAndCount9,ct++, rngvec9,MapOfIDAndWCSS9); + addtocounter(x, projector3, MapOfIDAndCent10, MapOfIDAndCount10,ct++, rngvec10,MapOfIDAndWCSS10); + addtocounter(x, projector3, MapOfIDAndCent11, MapOfIDAndCount11,ct++, rngvec11,MapOfIDAndWCSS11); + addtocounter(x, projector3, MapOfIDAndCent12, MapOfIDAndCount12,ct++, rngvec12,MapOfIDAndWCSS12); + + + } + } + + System.out.println("NumberOfMicroClustersBeforePruning = "+ MapOfIDAndCent3.size()); + + // next we want to prune the tree by parent count comparison + // follows breadthfirst search + + HashMap denseSetOfIDandCount2_1 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount1.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount1.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount1.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_1.put(parent_id, 0L); + + MapOfIDAndCent1.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + + 
} + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_1.remove(parent_id); + + MapOfIDAndCent1.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_2 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount2.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount2.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount2.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_2.put(parent_id, 0L); + + MapOfIDAndCent2.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_2.remove(parent_id); + + MapOfIDAndCent2.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_3 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount3.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount3.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount3.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_3.put(parent_id, 0L); + + MapOfIDAndCent3.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_3.remove(parent_id); + + MapOfIDAndCent3.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_4 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount4.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) 
(MapOfIDAndCount4.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount4.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_4.put(parent_id, 0L); + + MapOfIDAndCent4.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_4.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_4.remove(parent_id); + + MapOfIDAndCent4.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_4.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_5 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount5.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount5.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount5.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_5.put(parent_id, 0L); + + MapOfIDAndCent5.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_5.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_5.remove(parent_id); + + MapOfIDAndCent5.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_5.put(cur_id, (long) cur_count); + } + } + } + } + } + + + + HashMap denseSetOfIDandCount2_6 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount6.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount6.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount6.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_6.put(parent_id, 0L); + + MapOfIDAndCent6.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_6.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > 
parent_count) { + denseSetOfIDandCount2_6.remove(parent_id); + + MapOfIDAndCent6.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_6.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_7 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount7.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount7.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount7.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_7.put(parent_id, 0L); + + MapOfIDAndCent7.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_7.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_7.remove(parent_id); + + MapOfIDAndCent7.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_7.put(cur_id, (long) cur_count); + } + } + } + } + } + + HashMap denseSetOfIDandCount2_8 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount8.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount8.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount8.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_8.put(parent_id, 0L); + + MapOfIDAndCent8.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_8.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_8.remove(parent_id); + + MapOfIDAndCent8.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_8.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_9 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount9.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount9.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + 
int parent_count = (int) (MapOfIDAndCount9.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_9.put(parent_id, 0L); + + MapOfIDAndCent9.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_9.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_9.remove(parent_id); + + MapOfIDAndCent9.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_9.put(cur_id, (long) cur_count); + } + } + } + } + } + + HashMap denseSetOfIDandCount2_10 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount10.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount10.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount10.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_10.put(parent_id, 0L); + + MapOfIDAndCent10.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_10.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_10.remove(parent_id); + + MapOfIDAndCent10.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_10.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_11 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount11.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount11.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount11.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_11.put(parent_id, 0L); + + MapOfIDAndCent11.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_11.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_11.remove(parent_id); + + 
MapOfIDAndCent11.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_11.put(cur_id, (long) cur_count); + } + } + } + } + } + + HashMap denseSetOfIDandCount2_12 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount12.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount12.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount12.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_12.put(parent_id, 0L); + + MapOfIDAndCent12.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_12.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_12.remove(parent_id); + + MapOfIDAndCent12.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_12.put(cur_id, (long) cur_count); + } + } + } + } + } + + //remove keys with support less than 1 + Stream> stream2_1 = denseSetOfIDandCount2_1.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_1= new ArrayList<>(); + // sort and limit the list + stream2_1.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_1.add(x.getKey())); + + + Stream> stream2_2 = denseSetOfIDandCount2_2.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_2= new ArrayList<>(); + // sort and limit the list + stream2_2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_2.add(x.getKey())); + + + Stream> stream2_3 = denseSetOfIDandCount2_3.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_3= new ArrayList<>(); + // sort and limit the list + stream2_3.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_3.add(x.getKey())); + + + Stream> stream2_4 = denseSetOfIDandCount2_4.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_4= new ArrayList<>(); + // sort and limit the list + stream2_4.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_4.add(x.getKey())); + + Stream> stream2_5 = denseSetOfIDandCount2_5.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_5= new ArrayList<>(); + // sort and limit the list + stream2_5.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_5.add(x.getKey())); + + + Stream> stream2_6 = denseSetOfIDandCount2_6.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_6= new ArrayList<>(); + // sort and limit the list + stream2_6.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_6.add(x.getKey())); + + + Stream> stream2_7 = denseSetOfIDandCount2_7.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_7= new ArrayList<>(); + // sort and limit the list + stream2_7.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_7.add(x.getKey())); + + + Stream> stream2_8 = denseSetOfIDandCount2_8.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_8= new ArrayList<>(); + // sort and limit the list + stream2_8.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_8.add(x.getKey())); + + + Stream> stream2_9 = denseSetOfIDandCount2_9.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_9= new ArrayList<>(); + // sort and limit the list + stream2_9.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_9.add(x.getKey())); + + + Stream> stream2_10 = denseSetOfIDandCount2_10.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_10= new ArrayList<>(); + // sort and limit the list + stream2_10.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_10.add(x.getKey())); + + + Stream> stream2_11 = denseSetOfIDandCount2_11.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_11= new ArrayList<>(); + // sort and limit the list + stream2_11.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_11.add(x.getKey())); + + Stream> stream2_12 = denseSetOfIDandCount2_12.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_12= new ArrayList<>(); + // sort and limit the list + stream2_12.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_12.add(x.getKey())); + +// finding elbows + JythonTest elbowcalculator = new JythonTest(); + double sum_jt = 0; + + num_of_clusters_stage1[0]= elbowcalculator.find_elbow(sortedIDList2_1); + num_of_clusters_stage1[1]= elbowcalculator.find_elbow(sortedIDList2_2); + num_of_clusters_stage1[2]= elbowcalculator.find_elbow(sortedIDList2_3); + num_of_clusters_stage1[3]= elbowcalculator.find_elbow(sortedIDList2_4); + num_of_clusters_stage1[4]= elbowcalculator.find_elbow(sortedIDList2_5); + num_of_clusters_stage1[5]= elbowcalculator.find_elbow(sortedIDList2_6); + num_of_clusters_stage1[6]= elbowcalculator.find_elbow(sortedIDList2_7); + num_of_clusters_stage1[7]= elbowcalculator.find_elbow(sortedIDList2_8); + num_of_clusters_stage1[8]= elbowcalculator.find_elbow(sortedIDList2_9); + num_of_clusters_stage1[9]= elbowcalculator.find_elbow(sortedIDList2_10); + num_of_clusters_stage1[10]= elbowcalculator.find_elbow(sortedIDList2_11); + num_of_clusters_stage1[11]= 
elbowcalculator.find_elbow(sortedIDList2_12); + + for (int i=0 ; i<12; i++) { + + //int num_of_clusters_2= elbowcalculator.find_elbow(counts); + System.out.println("\n" + "No. of clusters_stage1 = " + num_of_clusters_stage1[i]); + //System.out.println( "No. of clusters_by_COUNT = " + num_of_clusters_2); + System.out.println( "************************************************************" ); + sum_jt = sum_jt + num_of_clusters_stage1[i]; + } + System.out.println("\n" + "sum of No. of clusters_stage1 = " + sum_jt); + + double avg_clus_stage1 = Math.ceil(sum_jt / 12.0) ; + System.out.println("\n" + "Average of No. of clusters_stage1 = " + avg_clus_stage1 ); + System.out.println( "************************************************************" ); + // finding the range of K for offline clustering : + min_k = (int) (avg_clus_stage1 - 3) ; + + max_k = (int) (avg_clus_stage1 + 3) ; + + if (min_k < 3 ) { min_k = 3 ; max_k = 9; } ; + + + float WCSS1 = 0; + float WCSS2 = 0; + float WCSS3 = 0; + float WCSS4 = 0; + float WCSS5 = 0; + float WCSS6 = 0; + float WCSS7 = 0; + float WCSS8 = 0; + float WCSS9 = 0; + float WCSS10 = 0; + float WCSS11 = 0; + float WCSS12 = 0; + + + HashMap denseSetOfIDandCount2 = new HashMap(); + HashMap MapOfIDAndCent = new HashMap<>(); + HashMap MapOfIDAndCount = new HashMap<>(); + HashMap MapOfIDAndWCSS = new HashMap<>(); + + + //* THIS IS THE RUNTIME CALCULATION OF WCSS STATISTICS WHICH IS DONE ONLINE: + + for (Long keys: sortedIDList2_1){ + WCSS1 = WCSS1 + MapOfIDAndWCSS1.get(keys);} + + for (Long keys: sortedIDList2_2) + { WCSS2 = WCSS2 + MapOfIDAndWCSS2.get(keys);} + + for (Long keys: sortedIDList2_3) + { WCSS3 = WCSS3 + MapOfIDAndWCSS3.get(keys);} + + for (Long keys: sortedIDList2_4) + { WCSS4 = WCSS4 + MapOfIDAndWCSS4.get(keys);} + + for (Long keys: sortedIDList2_5) + { WCSS5 = WCSS5 + MapOfIDAndWCSS5.get(keys);} + + for (Long keys: sortedIDList2_6){ + WCSS6 = WCSS6 + MapOfIDAndWCSS6.get(keys);} + + for (Long keys: sortedIDList2_7) + { WCSS7 = 
WCSS7 + MapOfIDAndWCSS7.get(keys);} + + for (Long keys: sortedIDList2_8) + { WCSS8 = WCSS8 + MapOfIDAndWCSS8.get(keys);} + + for (Long keys: sortedIDList2_9) + { WCSS9 = WCSS9 + MapOfIDAndWCSS9.get(keys);} + + for (Long keys: sortedIDList2_10) + { WCSS10 = WCSS10 + MapOfIDAndWCSS10.get(keys);} + + for (Long keys: sortedIDList2_11) + { WCSS11 = WCSS11 + MapOfIDAndWCSS11.get(keys);} + + for (Long keys: sortedIDList2_12) + { WCSS12 = WCSS12 + MapOfIDAndWCSS12.get(keys);} + + + + System.out.print(" wcss1 = " + WCSS1); + + System.out.print(" wcss2 = " + WCSS2); + + System.out.print(" wcss3 = " + WCSS3); + + System.out.print(" wcss4 = " + WCSS4); + + System.out.print(" wcss5 = " + WCSS5); + + System.out.print(" wcss6 = " + WCSS6); + + System.out.print(" wcss7 = " + WCSS7); + + System.out.print(" wcss8 = " + WCSS8); + + System.out.print(" wcss9 = " + WCSS9); + + System.out.print(" wcss10 = " + WCSS10); + + System.out.print(" wcss11 = " + WCSS11); + + System.out.print(" wcss12 = " + WCSS12); + +// float arr[] = {WCSS_off_1,WCSS_off_2,WCSS_off_3,WCSS_off_4,WCSS_off_5,WCSS_off_6,WCSS_off_7,WCSS_off_8,WCSS_off_9,WCSS_off_10}; + float arr[] = {WCSS1,WCSS2,WCSS3,WCSS4,WCSS5,WCSS6,WCSS7,WCSS8,WCSS9,WCSS10, WCSS11, WCSS12 }; + + int index_of_max = smallest(arr); + + if (index_of_max == 0){ + MapOfIDAndCount = MapOfIDAndCount1; + MapOfIDAndCent = MapOfIDAndCent1; + MapOfIDAndWCSS = MapOfIDAndWCSS1; + denseSetOfIDandCount2 = denseSetOfIDandCount2_1; + System.out.println("winner = tree1"); + } + if (index_of_max == 1){ + MapOfIDAndCount = MapOfIDAndCount2; + MapOfIDAndCent = MapOfIDAndCent2; + MapOfIDAndWCSS = MapOfIDAndWCSS2; + denseSetOfIDandCount2 = denseSetOfIDandCount2_2; + System.out.println("winner = tree2"); + } + if (index_of_max == 2){ + MapOfIDAndCount = MapOfIDAndCount3; + MapOfIDAndCent = MapOfIDAndCent3; + MapOfIDAndWCSS = MapOfIDAndWCSS3; + denseSetOfIDandCount2 = denseSetOfIDandCount2_3; + System.out.println("winner = tree3"); + } + if (index_of_max == 3) { + 
MapOfIDAndCount = MapOfIDAndCount4; + MapOfIDAndCent = MapOfIDAndCent4; + MapOfIDAndWCSS = MapOfIDAndWCSS4; + denseSetOfIDandCount2 = denseSetOfIDandCount2_4; + System.out.println("winner = tree4"); + } + + if (index_of_max == 4) { + MapOfIDAndCount = MapOfIDAndCount5; + MapOfIDAndCent = MapOfIDAndCent5; + MapOfIDAndWCSS = MapOfIDAndWCSS5; + denseSetOfIDandCount2 = denseSetOfIDandCount2_5; + System.out.println("winner = tree5"); + } + if (index_of_max == 5) { + MapOfIDAndCount = MapOfIDAndCount6; + MapOfIDAndCent = MapOfIDAndCent6; + MapOfIDAndWCSS = MapOfIDAndWCSS6; + denseSetOfIDandCount2 = denseSetOfIDandCount2_6; + System.out.println("winner = tree6"); + } + if (index_of_max == 6) { + MapOfIDAndCount = MapOfIDAndCount7; + MapOfIDAndCent = MapOfIDAndCent7; + MapOfIDAndWCSS = MapOfIDAndWCSS7; + denseSetOfIDandCount2 = denseSetOfIDandCount2_7; + System.out.println("winner = tree7"); + } + if (index_of_max == 7) { + MapOfIDAndCount = MapOfIDAndCount8; + MapOfIDAndCent = MapOfIDAndCent8; + MapOfIDAndWCSS = MapOfIDAndWCSS8; + denseSetOfIDandCount2 = denseSetOfIDandCount2_8; + System.out.println("winner = tree8"); + } + if (index_of_max == 8) { + MapOfIDAndCount = MapOfIDAndCount9; + MapOfIDAndCent = MapOfIDAndCent9; + MapOfIDAndWCSS = MapOfIDAndWCSS9; + denseSetOfIDandCount2 = denseSetOfIDandCount2_9; + System.out.println("winner = tree9"); + } + if (index_of_max == 9) { + MapOfIDAndCount = MapOfIDAndCount10; + MapOfIDAndCent = MapOfIDAndCent10; + MapOfIDAndWCSS = MapOfIDAndWCSS10; + denseSetOfIDandCount2 = denseSetOfIDandCount2_10; + System.out.println("winner = tree10"); + } + if (index_of_max == 10) { + MapOfIDAndCount = MapOfIDAndCount11; + MapOfIDAndCent = MapOfIDAndCent11; + MapOfIDAndWCSS = MapOfIDAndWCSS11; + denseSetOfIDandCount2 = denseSetOfIDandCount2_11; + System.out.println("winner = tree11"); + } + if (index_of_max == 11) { + MapOfIDAndCount = MapOfIDAndCount12; + MapOfIDAndCent = MapOfIDAndCent12; + MapOfIDAndWCSS = MapOfIDAndWCSS12; + 
denseSetOfIDandCount2 = denseSetOfIDandCount2_12; + System.out.println("winner = tree12"); + } + + + System.out.println("NumberOfMicroClustersAfterPruning&beforesortingLimit = "+ denseSetOfIDandCount2.size()); + + //remove keys with support less than 1 + Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2= new ArrayList<>(); + // sort and limit the list + stream2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2.add(x.getKey())); + + Multimap multimapWeightAndCent = ArrayListMultimap.create(); + + for (Long keys: sortedIDList2) + + { + + multimapWeightAndCent.put((Long)(MapOfIDAndCount.get(keys)), (float[]) (MapOfIDAndCent.get(keys))); + + } + + + return multimapWeightAndCent; + +} + + public void run() { + rngvec = new float[so.getDimparameter()]; + rngvec2 = new float[so.getDimparameter()]; + rngvec3 = new float[so.getDimparameter()]; + rngvec4 = new float[so.getDimparameter()]; + rngvec5 = new float[so.getDimparameter()]; + + rngvec6 = new float[so.getDimparameter()]; + rngvec7 = new float[so.getDimparameter()]; + rngvec8 = new float[so.getDimparameter()]; + rngvec9 = new float[so.getDimparameter()]; + rngvec10 = new float[so.getDimparameter()]; + rngvec11 = new float[so.getDimparameter()]; + rngvec12 = new float[so.getDimparameter()]; + + counter = 0; + boolean randVect = so.getRandomVector(); + + // Random r = new Random(so.getRandomSeed()); + // Random r = new Random(3800635955020675334L) ; + Random r = new Random(); + Random r2 = new Random(); + Random r3 = new Random(); + Random r4 = new Random(); + Random r5 = new Random(); + Random r6 = new Random(); + Random r7 = new Random(); + Random r8 = new Random(); + Random r9 = new Random(); + Random r10 = new Random(); + Random r11 = new Random(); + Random r12 = new Random(); + + if (randVect==true){ + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) r.nextGaussian(); + for (int i = 0; i 
< so.getDimparameter(); i++) + rngvec2[i] = (float) r2.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec3[i] = (float) r3.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec4[i] = (float) r4.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec5[i] = (float) r5.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec6[i] = (float) r6.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec7[i] = (float) r7.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec8[i] = (float) r8.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec9[i] = (float) r9.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec10[i] = (float) r10.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec11[i] = (float) r11.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec12[i] = (float) r12.nextGaussian(); + + } + + else { + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) 0; + + } + + Multimap WeightAndClusters = findDensityModes2(); + + Listcentroids2 = new ArrayList<>(); + List weights2 =new ArrayList<>(); + + + System.out.println("\tNumberOfMicroClusters_AfterPruning = "+ WeightAndClusters.size()); + System.out.println("getRandomVector = "+ randVect); + + + for (Long weights : WeightAndClusters.keys()) + { + + weights2.add((float)weights); + + } + + + for (Long weight : WeightAndClusters.keySet()) + + { + + centroids2.addAll(WeightAndClusters.get(weight)); + + } + + +// Agglomerative3 aggloOffline = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids2, so.getk()); +// aggloOffline.setWeights(weights2); +// this.centroids = aggloOffline.getCentroids(); + + KMeans2 aggloOffline2 = new KMeans2(); + aggloOffline2.setRawData(centroids2); + aggloOffline2.setWeights(weights2); + + + List data1 = null; + data1 = so.getRawData(); + + List elbow_wcss = new ArrayList<>(); + + for
(int k=min_k; k<=max_k ;k++) { + + aggloOffline2.setK(k); + List cent1 = aggloOffline2.getCentroids(); + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(cent1, data1)); + long tempu = (long) StatTests.WCSSECentroidsFloat(cent1, data1); + elbow_wcss.add(tempu); + } + + + // finding elbows + JythonTest elbowcalculator = new JythonTest(); + double sum_jt = 0; + + int num_of_clusters_stage2 = min_k + elbowcalculator.find_elbow(elbow_wcss); + + System.out.println("\n" + "No. of clusters_stage_2_Final = " + num_of_clusters_stage2 ); + + System.out.println("\n" + "No. of Data Points = " + so.getRawData().size() ); + + + // final choice of centroids : ( repetative calculation , please optimize ) + aggloOffline2.setK(num_of_clusters_stage2); + + this.centroids = aggloOffline2.getCentroids(); + + } + + + public static void main(String[] args) throws FileNotFoundException, + IOException , InterruptedException { + + // int k = 10;//6; + // int d = 200;//16; + // int n = 10000; + // float var = 1.5f; + // int count = 1; + // System.out.printf("ClusterVar\t"); + // for (int i = 0; i < count; i++) + // System.out.printf("Trial%d\t", i); + // System.out.printf("RealWCSS\n"); + + // String Output = "/C:/Users/deysn/Desktop/temp/har/run_results/10runs/OutputTwrpCents_mainfunc_1" ; + + // float f = var; + float avgrealwcss = 0; + float avgtime = 0; + // System.out.printf("%f\t", f); + // GenerateData gen = new GenerateData(k, n/k, d, f, true, .5f); + + // gen.writeCSVToFile(new File("/home/lee/Desktop/reclsh/in.csv")); + // List data = "/C:/Users/user/Desktop/temp/OutputTwrpCents1" + + // RPHashObject o = new SimpleArrayReader(gen.data, k); + + boolean raw = Boolean.parseBoolean(("raw")); + List data = null; + List data_in_round = new ArrayList() ; + // "C:\Users\sayan\OneDrive - University of Cincinnati\Documents\downloaded\run_results\run_results\3runs\har_k6\1D.txt" + + String inputfile = "C:/Users/sayan/OneDrive - University of 
Cincinnati/Documents/downloaded/run_results/run_results/3runs/har_k6/1D.txt" ; + System.out.println(inputfile); + + data = VectorUtil.readFile( inputfile , raw); + int count1=0; + int count2=0; + boolean flag = true; + + for (float[] element : data) + + { + count1 = count1+1; + //System.out.println(count1); + //System.out.println(element); + data_in_round.add(data.get(count1-1)); + count2 = count2 +1; + + //System.out.println(count2); + + //if (count2 >= 1000) { + if (count2 == 10299) { + System.out.println(count2); + + int dummyk = 8; + RPHashObject o = new SimpleArrayReader(data_in_round, dummyk); + + + o.setDimparameter(16); + o.setCutoff(70); + o.setRandomVector(true); + +// System.out.println("cutoff = "+ o.getCutoff()); +// System.out.println("get_random_Vector = "+ o.getRandomVector()); + + TWRPv6_wcss_offline2_TEST2_10runs_static_stream rphit = new TWRPv6_wcss_offline2_TEST2_10runs_static_stream(o); + long startTime = System.nanoTime(); + List centsr = rphit.getCentroids(); + + avgtime += (System.nanoTime() - startTime) / 100000000; + +// avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(),gen.getData()); + + String Output = "/C:/Users/sayan/OneDrive - University of Cincinnati/Documents/downloaded/results/har_6clus/har_k6_kmeans_130_cutoff"+"_" +"_"+".csv" ; + VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); + +// System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, data_in_round)); + System.out.println("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); + System.gc(); + +// System.out.printf("%.0f\n", avgrealwcss / count); + + + data_in_round.clear(); + count2=0; + centsr.clear(); + + } + + //System.out.println("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); + + + } + + int dummyk = 8; + RPHashObject o = new SimpleArrayReader(data_in_round, dummyk); + + + o.setDimparameter(16); + o.setCutoff(60); + o.setRandomVector(true); + +// 
System.out.println("cutoff = "+ o.getCutoff()); +// System.out.println("get_random_Vector = "+ o.getRandomVector()); + + TWRPv6_wcss_offline2_TEST2_10runs_static_stream rphit = new TWRPv6_wcss_offline2_TEST2_10runs_static_stream(o); + long startTime = System.nanoTime(); + List centsr = rphit.getCentroids(); + + avgtime += (System.nanoTime() - startTime) / 100000000; + +// avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(),gen.getData()); + + String Output = "/C:/Users/sayan/OneDrive - University of Cincinnati/Documents/downloaded/results/har_6clus/har_k6_kmeans_130_cutoff"+"_" +"_"+".csv" ; + VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); + +// System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, data_in_round)); + System.out.println("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); + System.gc(); + +// System.out.printf("%.0f\n", avgrealwcss / count); + + + } + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + this.so.setK(getk); + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + return false; + } + + //@Override + public void setCutoff(int getCutoff) { + this.so.setCutoff(getCutoff); + } + + //@Override + public void setRandomVector(boolean getRandomVector) { + this.so.setRandomVector(getRandomVector); + } + + +} diff --git 
a/src/main/java/edu/uc/rphash/aging/ageCentriods.java b/src/main/java/edu/uc/rphash/aging/ageCentriods.java index 665f5f8..fe5a601 100644 --- a/src/main/java/edu/uc/rphash/aging/ageCentriods.java +++ b/src/main/java/edu/uc/rphash/aging/ageCentriods.java @@ -17,7 +17,31 @@ public void run() { // TODO Auto-generated method stub } + + + public static float[][] weighted_merge(double cnt_1, float[] x_1, + double cnt_2, float[] x_2) { + + + cnt_1 = (float) cnt_1; + cnt_2 = (float) cnt_2; + + + float cnt_r = (float) (cnt_1 + cnt_2); + float[] x_r = new float[x_1.length]; + + for (int i = 0; i < x_1.length; i++) { + x_r[i] = (float) ((cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r); + + } + float[][] ret = new float[3][]; + ret[0] = new float[1]; + + ret[0][0] = cnt_r; + ret[1] = x_r; + return ret; + } public static List> ageListOfcent( List> prev ) { @@ -40,4 +64,28 @@ public static List> ageListOfcent( List> prev ) { return prev; } + + + + + public static List ageListOfcents2( List prev , List curr) { + + + for (int i = 0; i < prev.size(); i++) + { + + double ageMultiplier= decay.ExpDecayFormula2 ( decayRate , i ); + List tempCents = (List) prev.get(i); + + for (int j =0 ; j counts ){ int cutoff = 0; // System.out.print("\n" + " size_of_list : " + size_of_list); - if(size_of_list >= 100){ - cutoff = 100; - } - if(size_of_list < 100){ - cutoff = size_of_list ; - } - - System.out.print("\n" + " cutoff : " + cutoff + "\n"); + // if(size_of_list >= 100){ + // cutoff = 100; + // } + // if(size_of_list < 100){ + // cutoff = size_of_list ; + // } + + cutoff =size_of_list; + //System.out.print("\n" + " cutoff : " + cutoff + "\n"); + //// System.out.print(" cutoff : " + cutoff + "\n"); List counts1 = counts; +//// System.out.print("\n" + " elbow values before smoothing : "+"\n" + counts1 + "\n"); double[][] elbowdata = new double[cutoff][2] ; @@ -224,9 +229,12 @@ public int find_elbow( List counts ){ // public ArrayList run(double[][] data, double s, int smoothingWindow, boolean 
findElbows) List list_of_elbows= new ArrayList<>(); -ArrayList elbows = run ( elbowdata, 1 , 1, false); +ArrayList elbows = run ( elbowdata, 1 , 0, false); + + + -System.out.print("\n" + "number of elbow points : " + elbows.size()); +//// System.out.print("\n" + "number of elbow points : " + elbows.size()); for (double[] point : elbows) { //System.out.print("\n" +"Knee point:" + Arrays.toString(point)); //System.out.println("\n" +"No. of clusters complement = " + point[1] ); @@ -339,7 +347,7 @@ public static void main(String[] args){ // public ArrayList run(double[][] data, double s, int smoothingWindow, boolean findElbows) - ArrayList elbows = elbowcalculator.run ( elbowdata3, 1 , 1, false); + ArrayList elbows = elbowcalculator.run ( elbowdata3, 1 , 0, false); System.out.print("\n" + "number of elbow points : " + elbows.size()); for (double[] point : elbows) {