diff --git a/.classpath b/.classpath
index 29836c6..90af4e4 100644
--- a/.classpath
+++ b/.classpath
@@ -1,9 +1,16 @@
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/.pydevproject b/.pydevproject
new file mode 100644
index 0000000..98ee0df
--- /dev/null
+++ b/.pydevproject
@@ -0,0 +1,5 @@
+
+
+ Default
+ python interpreter
+
diff --git a/.settings/org.eclipse.core.resources.prefs b/.settings/org.eclipse.core.resources.prefs
new file mode 100644
index 0000000..d1fb81f
--- /dev/null
+++ b/.settings/org.eclipse.core.resources.prefs
@@ -0,0 +1,2 @@
+eclipse.preferences.version=1
+encoding//src/main/java/edu/uc/rphash/tests/plotting.java=UTF-8
diff --git a/scripts/ari_test.py b/scripts/ari_test.py
new file mode 100644
index 0000000..4aef401
--- /dev/null
+++ b/scripts/ari_test.py
@@ -0,0 +1,64 @@
+import pandas as pd
+import numpy as np
+#from scipy.spatial import distance
+from math import dist
+import os
+import csv
+import openpyxl
+from sklearn.metrics.cluster import adjusted_rand_score
+
+# https://github.com/cran/dendextend/blob/master/R/find_k.R
+# https://cran.r-project.org/web/packages/fpc/fpc.pdf
+
+# scipy.spatial.distance.euclidean(A, B)
+# dist([1, 0, 0], [0, 1, 0])
+
+labels_true_gt=np.genfromtxt("C:/Users/dey.sn/Downloads/temp/haraal/haraal_labels_gt.csv", delimiter=',')
+print(labels_true_gt.shape[0])
+print(labels_true_gt)
+#column = nArr2D[:, 1]
+#output_labels = np.genfromtxt('C:/Users/dey.sn/Downloads/work/output/har_k6/Labels_har_k6_kmpp,cutoff,90,k6.csv', delimiter=',')
+'''
+output_labels_col1=output_labels[:,0]
+print(output_labels.shape[1])
+print(output_labels_col1)
+for cols in range(output_labels.shape[1]):
+ print(adjusted_rand_score(labels_true_gt,output_labels[:,cols]))
+
+'''
+# This is the path where you want to search
+path = r'C:/Users/dey.sn/Downloads/work/output/haraal_k6/'
+# this is the extension you want to detect
+extension = '.csv'
+substring="Labels"
+count=0
+wb=openpyxl.Workbook()
+sheet=wb.active
+sheet.title= 'haraal_ari'
+for root, dirs_list, files_list in os.walk(path):
+ for file_name in files_list:
+ if os.path.splitext(file_name)[-1] == extension:
+ file_name_path = os.path.join(root, file_name)
+ print(file_name)
+ print(file_name_path) # This is the full path of the filter file
+ try:
+ index=file_name.index(substring)
+ # print(index)
+ if(index==0):
+ count+=1
+ output_labels = np.genfromtxt(file_name_path, delimiter=',')
+ b = sheet.cell(row=count, column=2)
+ b.value = file_name
+ for cols in range(output_labels.shape[1]):
+ ari=adjusted_rand_score(labels_true_gt,output_labels[:,cols])
+ print(ari)
+ c = sheet.cell(row=count, column=(cols+12))
+ c.value = ari
+ except ValueError:
+ print(
+ "Not found!")
+ else:
+ print(
+ "Found!")
+print(count)
+wb.save("C:/Users/dey.sn/Downloads/work/output/haraal_k6/results_python_ari_all_runs.xlsx")
diff --git a/scripts/knee_test.py b/scripts/knee_test.py
new file mode 100644
index 0000000..f5d1b68
--- /dev/null
+++ b/scripts/knee_test.py
@@ -0,0 +1,350 @@
+import numpy as np
+from scipy import interpolate
+from scipy.signal import argrelextrema
+from sklearn.preprocessing import PolynomialFeatures
+from sklearn.linear_model import LinearRegression
+import warnings
+from typing import Tuple, Optional, Iterable
+import matplotlib.pyplot as plt
+import pandas as pd
+
+
+
+
+class KneeLocator(object):
+ def __init__(
+ self,
+ x: Iterable[float],
+ y: Iterable[float],
+ S: float = 1.0,
+ curve: str = "concave",
+ direction: str = "increasing",
+ interp_method: str = "interp1d",
+ online: bool = False,
+ ):
+ """
+ Once instantiated, this class attempts to find the point of maximum
+ curvature on a line. The knee is accessible via the `.knee` attribute.
+ :param x: x values.
+ :param y: y values.
+ :param S: Sensitivity, original paper suggests default of 1.0
+ :param curve: If 'concave', algorithm will detect knees. If 'convex', it
+ will detect elbows.
+ :param direction: one of {"increasing", "decreasing"}
+ :param interp_method: one of {"interp1d", "polynomial"}
+ :param online: Will correct old knee points if True, will return first knee if False
+ """
+ # Step 0: Raw Input
+ self.x = np.array(x)
+ self.y = np.array(y)
+ self.curve = curve
+ self.direction = direction
+ self.N = len(self.x)
+ self.S = S
+ self.all_knees = set()
+ self.all_norm_knees = set()
+ self.all_knees_y = []
+ self.all_norm_knees_y = []
+ self.online = online
+
+ # Step 1: fit a smooth line
+ if interp_method == "interp1d":
+ uspline = interpolate.interp1d(self.x, self.y)
+ self.Ds_y = uspline(self.x)
+ elif interp_method == "polynomial":
+ pn_model = PolynomialFeatures(7)
+ xpn = pn_model.fit_transform(self.x.reshape(-1, 1))
+ regr_model = LinearRegression()
+ regr_model.fit(xpn, self.y)
+ self.Ds_y = regr_model.predict(
+ pn_model.fit_transform(self.x.reshape(-1, 1))
+ )
+ else:
+ raise ValueError(
+ "{} is an invalid interp_method parameter, use either 'interp1d' or 'polynomial'".format(
+ interp_method
+ )
+ )
+
+ # Step 2: normalize values
+ self.x_normalized = self.__normalize(self.x)
+ self.y_normalized = self.__normalize(self.Ds_y)
+
+ # Step 3: Calculate the Difference curve
+ self.x_normalized, self.y_normalized = self.transform_xy(
+ self.x_normalized, self.y_normalized, self.direction, self.curve
+ )
+ # normalized difference curve
+ self.y_difference = self.y_normalized - self.x_normalized
+ self.x_difference = self.x_normalized.copy()
+
+ # Step 4: Identify local maxima/minima
+ # local maxima
+ self.maxima_indices = argrelextrema(self.y_difference, np.greater_equal)[0]
+ self.x_difference_maxima = self.x_difference[self.maxima_indices]
+ self.y_difference_maxima = self.y_difference[self.maxima_indices]
+
+ # local minima
+ self.minima_indices = argrelextrema(self.y_difference, np.less_equal)[0]
+ self.x_difference_minima = self.x_difference[self.minima_indices]
+ self.y_difference_minima = self.y_difference[self.minima_indices]
+
+ # Step 5: Calculate thresholds
+ self.Tmx = self.y_difference_maxima - (
+ self.S * np.abs(np.diff(self.x_normalized).mean())
+ )
+
+ # Step 6: find knee
+ self.knee, self.norm_knee = self.find_knee()
+
+ # Step 7: If we have a knee, extract data about it
+ self.knee_y = self.norm_knee_y = None
+ if self.knee:
+ self.knee_y = self.y[self.x == self.knee][0]
+ self.norm_knee_y = self.y_normalized[self.x_normalized == self.norm_knee][0]
+
+ @staticmethod
+ def __normalize(a: Iterable[float]) -> Iterable[float]:
+ """normalize an array
+ :param a: The array to normalize
+ """
+ return (a - min(a)) / (max(a) - min(a))
+
+ @staticmethod
+ def transform_xy(
+ x: Iterable[float], y: Iterable[float], direction: str, curve: str
+ ) -> Tuple[Iterable[float], Iterable[float]]:
+ """transform x and y to concave, increasing based on given direction and curve"""
+ # convert elbows to knees
+ if curve == "convex":
+ x = x.max() - x
+ y = y.max() - y
+ # flip decreasing functions to increasing
+ if direction == "decreasing":
+ y = np.flip(y, axis=0)
+
+ if curve == "convex":
+ x = np.flip(x, axis=0)
+ y = np.flip(y, axis=0)
+
+ return x, y
+
+ def find_knee(self,):
+ """This function finds and sets the knee value and the normalized knee value. """
+ if not self.maxima_indices.size:
+ warnings.warn(
+ "No local maxima found in the difference curve\n"
+ "The line is probably not polynomial, try plotting\n"
+ "the difference curve with plt.plot(knee.x_difference, knee.y_difference)\n"
+ "Also check that you aren't mistakenly setting the curve argument",
+ RuntimeWarning,
+ )
+ return None, None
+
+ # placeholder for which threshold region i is located in.
+ maxima_threshold_index = 0
+ minima_threshold_index = 0
+ # traverse the difference curve
+ for i, x in enumerate(self.x_difference):
+            # skip points on the curve before the first local maxima
+ if i < self.maxima_indices[0]:
+ continue
+
+ j = i + 1
+
+ # reached the end of the curve
+ if x == 1.0:
+ break
+
+ # if we're at a local max, increment the maxima threshold index and continue
+ if (self.maxima_indices == i).any():
+ threshold = self.Tmx[maxima_threshold_index]
+ threshold_index = i
+ maxima_threshold_index += 1
+ # values in difference curve are at or after a local minimum
+ if (self.minima_indices == i).any():
+ threshold = 0.0
+ minima_threshold_index += 1
+
+ if self.y_difference[j] < threshold:
+ if self.curve == "convex":
+ if self.direction == "decreasing":
+ knee = self.x[threshold_index]
+ norm_knee = self.x_normalized[threshold_index]
+ else:
+ knee = self.x[-(threshold_index + 1)]
+ norm_knee = self.x_normalized[-(threshold_index + 1)]
+
+ elif self.curve == "concave":
+ if self.direction == "decreasing":
+ knee = self.x[-(threshold_index + 1)]
+ norm_knee = self.x_normalized[-(threshold_index + 1)]
+ else:
+ knee = self.x[threshold_index]
+ norm_knee = self.x_normalized[threshold_index]
+
+ # add the y value at the knee
+ y_at_knee = self.y[self.x == knee][0]
+ y_norm_at_knee = self.y_normalized[self.x_normalized == norm_knee][0]
+ if knee not in self.all_knees:
+ self.all_knees_y.append(y_at_knee)
+ self.all_norm_knees_y.append(y_norm_at_knee)
+
+ # now add the knee
+ self.all_knees.add(knee)
+ self.all_norm_knees.add(norm_knee)
+
+ # if detecting in offline mode, return the first knee found
+ if self.online is False:
+ return knee, norm_knee
+
+ if self.all_knees == set():
+ warnings.warn("No knee/elbow found")
+ return None, None
+
+ return knee, norm_knee
+
+ def plot_knee_normalized(self, figsize: Optional[Tuple[int, int]] = None):
+ """Plot the normalized curve, the difference curve (x_difference, y_normalized) and the knee, if it exists.
+
+ :param figsize: Optional[Tuple[int, int]
+ The figure size of the plot. Example (12, 8)
+ :return: NoReturn
+ """
+ import matplotlib.pyplot as plt
+
+ if figsize is None:
+ figsize = (6, 6)
+
+ plt.figure(figsize=figsize)
+ plt.title("Normalized Knee Point")
+ plt.plot(self.x_normalized, self.y_normalized, "b", label="normalized curve")
+ plt.plot(self.x_difference, self.y_difference, "r", label="difference curve")
+ plt.xticks(
+ np.arange(self.x_normalized.min(), self.x_normalized.max() + 0.1, 0.1)
+ )
+ plt.yticks(
+ np.arange(self.y_difference.min(), self.y_normalized.max() + 0.1, 0.1)
+ )
+
+ plt.vlines(
+ self.norm_knee,
+ plt.ylim()[0],
+ plt.ylim()[1],
+ linestyles="--",
+ label="knee/elbow",
+ )
+ plt.legend(loc="best")
+
+ def plot_knee(self, figsize: Optional[Tuple[int, int]] = None):
+ """
+ Plot the curve and the knee, if it exists
+
+ :param figsize: Optional[Tuple[int, int]
+ The figure size of the plot. Example (12, 8)
+ :return: NoReturn
+ """
+ import matplotlib.pyplot as plt
+
+ if figsize is None:
+ figsize = (6, 6)
+
+ plt.figure(figsize=figsize)
+ plt.title("Knee Point")
+ plt.plot(self.x, self.y, "b", label="data")
+ plt.vlines(
+ self.knee, plt.ylim()[0], plt.ylim()[1], linestyles="--", label="knee/elbow"
+ )
+ plt.legend(loc="best")
+
+ # Niceties for users working with elbows rather than knees
+ @property
+ def elbow(self):
+ return self.knee
+
+ @property
+ def norm_elbow(self):
+ return self.norm_knee
+
+ @property
+ def elbow_y(self):
+ return self.knee_y
+
+ @property
+ def norm_elbow_y(self):
+ return self.norm_knee_y
+
+ @property
+ def all_elbows(self):
+ return self.all_knees
+
+ @property
+ def all_norm_elbows(self):
+ return self.all_norm_knees
+
+ @property
+ def all_elbows_y(self):
+ return self.all_knees_y
+
+ @property
+ def all_norm_elbows_y(self):
+ return self.all_norm_knees_y
+
+
+## xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
+
+
+import pandas as pd
+import timeit
+
+#df=pd.read_excel("C:/Users/dey.sn/Downloads/work/output/elbow_graph_stage1_syn_data.xlsx")
+#df=pd.read_excel("data.xlsx", sheet_name='har2', header=None, na_values=['NA'], usecols="Aq,at",skiprows=range(97),nrows=6)
+df=pd.read_excel("C:/Users/dey.sn/Downloads/work/output/elbow_graph_stage1_syn_data.xlsx", sheet_name='N5%_1000', header=None, na_values=['NA'], usecols="A,y",skiprows=range(3),nrows=99)
+#print(df)
+conv_arr= df.values
+start = timeit.default_timer()
+
+#split matrix into 3 columns each into 1d array
+#print(conv_arr.shape)
+#print(conv_arr[1,1])
+arr1 = np.delete(conv_arr,1,axis=1)
+arr2 = np.delete(conv_arr,0,axis=1)
+
+#converting into 1D array
+x = arr1.ravel()
+y = arr2.ravel()
+
+kn = KneeLocator(list(x), y , S=0.0, curve='convex', direction='decreasing',online=False ) #,interp_method='polynomial')
+stop = timeit.default_timer()
+print('Time: ', stop - start)
+kn2 = KneeLocator(list(x), y , S=1.0, curve='convex', direction='decreasing',online=False )
+print(kn.knee)
+print(kn2.knee)
+#print(kn.norm_knee)
+
+plt.style.use('ggplot')
+plt.plot()
+plt.xlabel('K (no. of clusters) ')
+plt.ylabel('WCSSE')
+#plt.title('Elbow method for optimal k.[data=HAR, k=4, Pred. k= %d]' %(kn.knee))
+plt.suptitle('Elbow Method For Optimal Cluster Determination [data=Noise_30_percent, K=10, Pred.K = %d]' %(kn.knee),x=0.5, y=0.000, ha="center" , va="bottom")
+plt.plot(x, y, 'bx-')
+#plt.xscale('log')
+plt.grid(True)
+plt.xticks()
+plt.vlines(kn.knee, plt.ylim()[0], plt.ylim()[1], linestyles='dashed')
+plt.savefig("C:/Users/dey.sn/Downloads/work/output/N30%_1000_graph_s0.pdf")
+plt.show()
+
+plt.style.use('ggplot')
+plt.plot()
+plt.xlabel('Buckets')
+plt.ylabel('Counts')
+plt.title('Elbow method for optimal k. [data=Noise_30_percent, K=10, Pred.K = %d]' %(kn2.knee))
+plt.plot(x, y, 'bx-')
+#plt.xscale('log')
+plt.grid(True)
+plt.xticks()
+plt.vlines(kn2.knee, plt.ylim()[0], plt.ylim()[1], linestyles='dashed')
+plt.savefig("C:/Users/dey.sn/Downloads/work/output/N30%_1000_graph_s1.pdf")
+plt.show()
\ No newline at end of file
diff --git a/scripts/measures_wcss.py b/scripts/measures_wcss.py
new file mode 100644
index 0000000..17d4bdb
--- /dev/null
+++ b/scripts/measures_wcss.py
@@ -0,0 +1,87 @@
+import pandas as pd
+import numpy as np
+#from scipy.spatial import distance
+from math import dist
+import os
+import csv
+import openpyxl
+from sklearn.metrics.cluster import adjusted_rand_score
+
+# https://github.com/cran/dendextend/blob/master/R/find_k.R
+# https://cran.r-project.org/web/packages/fpc/fpc.pdf
+
+# scipy.spatial.distance.euclidean(A, B)
+# dist([1, 0, 0], [0, 1, 0])
+
+data=np.genfromtxt("C:/Users/dey.sn/Downloads/temp/haraal/2d.csv", delimiter=',')
+print(data.shape[0])
+#print(data[10298])
+vectors=data.shape[0]
+
+# This is the path where you want to search
+path = r'C:/Users/dey.sn/Downloads/work/output/haraal_k6/'
+# this is the extension you want to detect
+extension = '.csv'
+substring="haraal_k6"
+count=0
+wb=openpyxl.Workbook()
+sheet=wb.active
+sheet.title= 'haraal'
+for root, dirs_list, files_list in os.walk(path):
+ for file_name in files_list:
+ if os.path.splitext(file_name)[-1] == extension:
+ file_name_path = os.path.join(root, file_name)
+ print(file_name)
+ print(file_name_path) # This is the full path of the filter file
+ try:
+ index=file_name.index(substring)
+ # print(index)
+ if(index==0):
+ count+=1
+ centarr = np.genfromtxt(file_name_path, delimiter=',')
+ b = sheet.cell(row=count, column=2)
+ b.value = file_name
+# centarr = np.genfromtxt('C:/Users/dey.sn/Downloads/work/output/har_k6/har_k6_kmeans_120cutoff _4_2.csv', delimiter=',')
+# print(np.shape(centarr))
+# print(centarr[0],centarr[1])
+ index = 2
+ row=int(centarr[0]) # number of centroids
+ col=int(centarr[1])
+ cents=[]
+ for i in range(row):
+ c1=[]
+ for j in range(col):
+ c1.append(centarr[index])
+ index += 1
+ cents.append(c1)
+
+# print(cents[2])
+# print(np.shape(cents))
+
+ wcss1=0
+ for i in range (vectors):
+ distance1 = []
+ for j in range(row):
+# print(j)
+ d1=(dist(data[i], cents[j]))
+ #print(d1)
+ distance1.append(d1)
+
+ print(distance1)
+ mindist=min(distance1)
+ print(mindist)
+
+ wcss1= int(wcss1 + (mindist*mindist))
+
+ print("wcss1 is : " , (wcss1))
+
+ c = sheet.cell(row=count, column=12)
+ c.value = wcss1
+ except ValueError:
+                print(
+                    "Not found!")
+ else:
+                print(
+                    "Found!")
+print(count)
+wb.save("C:/Users/dey.sn/Downloads/work/output/haraal_k6/results_python_wcss_all_runs.xlsx")
diff --git a/src/main/java/edu/uc/rphash/Centroid.java b/src/main/java/edu/uc/rphash/Centroid.java
index 8ffaa6d..20fc768 100644
--- a/src/main/java/edu/uc/rphash/Centroid.java
+++ b/src/main/java/edu/uc/rphash/Centroid.java
@@ -1,6 +1,7 @@
package edu.uc.rphash;
import java.util.ArrayList;
+import java.util.List;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.stream.Collector;
import java.util.stream.Collectors;
@@ -233,5 +234,21 @@ public int compareTo(Centroid o) {
return (int) (o.id - this.id);
}
+
+
+
+
+ public static void removeallobjects(List DB) {
+ // float[] tmp;
+		for (int i = DB.size() - 1; i >= 0; i--) {
+ //tmp = DB.get(i).centroid();
+ DB.remove(i);
+
+ }
+
+
+ }
+
+
}
diff --git a/src/main/java/edu/uc/rphash/Dis_PPAHStream.java b/src/main/java/edu/uc/rphash/Dis_PPAHStream.java
new file mode 100644
index 0000000..7630571
--- /dev/null
+++ b/src/main/java/edu/uc/rphash/Dis_PPAHStream.java
@@ -0,0 +1,371 @@
+package edu.uc.rphash;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ConcurrentSkipListSet;
+import java.util.concurrent.CopyOnWriteArrayList;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.Future;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.stream.Collectors;
+
+import edu.uc.rphash.Readers.RPHashObject;
+import edu.uc.rphash.Readers.SimpleArrayReader;
+import edu.uc.rphash.decoders.Decoder;
+import edu.uc.rphash.decoders.Spherical;
+import edu.uc.rphash.frequentItemSet.ItemSet;
+import edu.uc.rphash.frequentItemSet.SimpleFrequentItemSet;
+import edu.uc.rphash.lsh.LSH;
+//import edu.uc.rphash.projections.DBFriendlyProjection;
+import edu.uc.rphash.projections.Projector;
+import edu.uc.rphash.standardhash.HashAlgorithm;
+import edu.uc.rphash.standardhash.NoHash;
+import edu.uc.rphash.tests.clusterers.KMeans2;
+import edu.uc.rphash.tests.generators.GenerateData;
+import edu.uc.rphash.util.VectorUtil;
+
+public class Dis_PPAHStream implements Clusterer {
+ // float variance;
+
+ public ItemSet is;
+
+ List labels;
+ HashMap labelmap;
+
+ private int processors = 1;
+
+ public static long mapfunc(float[] vec, LSH lshfunc) {
+
+ return lshfunc.lshHash(vec);
+
+ }
+
+ public RPHashObject mapreduce1() {
+
+ //------------This is Setup Code-------------
+ // create our LSH Machine
+ HashAlgorithm hal = new NoHash(so.getHashmod());
+ Iterator vecs = so.getVectorIterator();
+ if (!vecs.hasNext())
+ return so;
+
+ int logk = (int) (.5 + Math.log(so.getk()) / Math.log(2));// log k and
+ // round to
+ // integer
+ int k = so.getk() * logk;
+ is = new SimpleFrequentItemSet(k);
+ Decoder dec = so.getDecoderType();
+ dec.setCounter(is);
+
+ Projector p = so.getProjectionType();
+ p.setOrigDim(so.getdim());
+ p.setProjectedDim(dec.getDimensionality());
+ p.setRandomSeed(so.getRandomSeed());
+ p.init();
+ // no noise to start with
+ List noise = LSH.genNoiseTable(
+ dec.getDimensionality(),
+ so.getNumBlur(),
+ new Random(),
+ dec.getErrorRadius()
+ / (dec.getDimensionality() * dec.getDimensionality()));
+
+ LSH lshfunc = new LSH(dec, p, hal, noise, so.getNormalize());
+
+ // add to frequent itemset the hashed Decoded randomly projected vector
+
+
+ List dat = so.getRawData();
+
+ //Dey
+ //-------------------------
+ //------------This is the actual map function-------------
+
+ //this is the actual map
+ ForkJoinPool forkJoinPool = new ForkJoinPool(this.processors );
+ try {
+ forkJoinPool.submit(() ->
+ dat.parallelStream().map(s->mapfunc(s,lshfunc)).forEach(s->is.add(s))
+ ).get();
+ } catch (ExecutionException|InterruptedException e) {
+ e.printStackTrace();
+ }
+ forkJoinPool.shutdown();
+
+ //-------------------------
+
+
+ //------------This is clean up code-------------
+ List topids = is.getTop();
+ so.setPreviousTopID(topids);
+
+ List topsizes = is.getCounts();
+
+
+ // this is where the parallel reduce function would be
+ // to sum up the counts that correspond to hash_ids
+ // so very much the word count example
+ List countsAsFloats = new ArrayList();
+ for (long ct : topsizes)
+ countsAsFloats.add((float) ct);
+ so.setCounts(countsAsFloats);
+ return so;
+ }
+
+ public static void redFunc(float[] vec, LSH lshfunc, List noise,
+ List labels, List centroids) {
+ long[] hash = lshfunc.lshHashRadius(vec, noise);
+ labels.add(-1l);
+ // radius probe around the vector
+ for (Centroid cent : centroids) {
+ for (long h : hash) {
+ if (cent.ids.contains(h)) {
+ cent.updateVec(vec);
+ labels.set(labels.size() - 1, cent.id);
+ }
+ }
+ }
+ }
+
+ public static long[] redFunc(float[] vec, LSH lshfunc, List noise) {
+ return lshfunc.lshHashRadius(vec, noise);
+ }
+
+ /*
+ * This is the second phase after the top ids have been in the reduce phase
+ * aggregated
+ */
+ public RPHashObject mapreduce2() {
+
+ //------------This is Setup Code-------------
+ Iterator vecs = so.getVectorIterator();
+ if (!vecs.hasNext())
+ return so;
+ float[] vec = vecs.next();
+
+ HashAlgorithm hal = new NoHash(so.getHashmod());
+ Decoder dec = so.getDecoderType();
+
+ Projector p = so.getProjectionType();
+ p.setOrigDim(so.getdim());
+ p.setProjectedDim(dec.getDimensionality());
+ p.setRandomSeed(so.getRandomSeed());
+ p.init();
+
+ List noise = LSH.genNoiseTable(
+ so.getdim(),
+ so.getNumBlur(),
+ new Random(so.getRandomSeed()),
+ (float) (dec.getErrorRadius())
+ / (float) (dec.getDimensionality() * dec
+ .getDimensionality()));
+
+ LSH lshfunc = new LSH(dec, p, hal, noise, so.getNormalize());
+ ArrayList centroids = new ArrayList();
+
+ for (long id : so.getPreviousTopID()) {
+ centroids.add(new Centroid(so.getdim(), id, -1));
+ }
+
+ //DEY
+ //-------------------------------------------------
+ //------------This is the parallel map-------------
+
+ List dat = so.getRawData();
+
+
+ ForkJoinPool forkJoinPool = new ForkJoinPool(this.processors );
+ try {
+ //parallel map
+ forkJoinPool.submit(() ->
+			dat.parallelStream().forEach(s -> { long[] hashes = redFunc(s,lshfunc,noise);
+			//end parallel map
+
+			//parallel reduce
+			//local centroids is what would need to be implemented
+			// to update in parallel in each node
+			// currently this thing shares the centroids list, which is a bottleneck
+			// the reducer would need to use this to reduce centroids with the same id
+			// Centroid.merge(ctcent1, cent1,wcsscent1,ctcent2, cent2,wcsscent2);
+//			List localcentroids = centroids.stream().map(Centroid::new).collect(Centroid.toArrayList());
+			for (Centroid cent : centroids) {
+				for (long h : hashes)
+				{
+					if (cent.ids.contains(h))
+					{
+						cent.updateVec(s);
+ }
+ }
+ }
+ })).get();
+ } catch (InterruptedException|ExecutionException e) {
+ e.printStackTrace();
+ }
+
+ forkJoinPool.shutdown();
+ //-------------------------------------------------
+
+ //------------This is the cleanup code-------------
+ //Sequential
+
+ Clusterer offlineclusterer = new KMeans2();//so.getOfflineClusterer();
+ offlineclusterer.setData(centroids.stream().collect(Centroid.toArrayList()));
+ offlineclusterer.setWeights(so.getCounts());
+ offlineclusterer.setK(so.getk());
+
+// this.labelmap = VectorUtil.generateIDMap(centroids, this.centroids);
+ so.setCentroids(offlineclusterer.getCentroids());
+ return so;
+ }
+
+ // 271458
+ // 264779.7
+
+ public List getLabels() {
+ for (int i = 0; i < labels.size(); i++) {
+ if (labelmap.containsKey(labels.get(i))) {
+ labels.set(i, labelmap.get(labels.get(i)));
+ } else {
+ labels.set(i, -1l);
+ }
+ }
+ return this.labels;
+ }
+
+ private List centroids = null;
+ private RPHashObject so;
+
+ public Dis_PPAHStream(List data, int k) {
+ so = new SimpleArrayReader(data, k);
+ }
+
+// int threads = 1;
+
+ public Dis_PPAHStream(List data, int k, int processors) {
+
+ this.processors = processors;
+ so = new SimpleArrayReader(data, k);
+ so.setParallel(true);
+ }
+
+ public Dis_PPAHStream(List data, int k, int times, int rseed) {
+ so = new SimpleArrayReader(data, k);
+ }
+
+ public Dis_PPAHStream(RPHashObject so) {
+ this.so = so;
+ }
+
+ public List getCentroids(RPHashObject so) {
+ this.so = so;
+ if (centroids == null)
+ run();
+ return centroids;
+ }
+
+ @Override
+ public List getCentroids() {
+ if (centroids == null)
+ run();
+
+ return centroids;
+ }
+
+ private void run() {
+ mapreduce1();
+ mapreduce2();
+		this.centroids = so.getCentroids();
+ }
+
+ public static void main(String[] args) {
+ int k = 10;
+ int d = 1000;
+ int n = 10000;
+ float var = 1f;
+ int count = 5;
+ System.out.printf("Decoder: %s\n", "Sphere");
+ System.out.printf("ClusterVar\t");
+ for (int i = 0; i < count; i++)
+ System.out.printf("Trial%d\t", i);
+ System.out.printf("RealWCSS\n");
+
+ for (float f = var; f < 3.01; f += .05f) {
+ float avgrealwcss = 0;
+ float avgtime = 0;
+ System.out.printf("%f\t", f);
+ for (int i = 0; i < count; i++) {
+ GenerateData gen = new GenerateData(k, n / k, d, f, true, 1f);
+ RPHashObject o = new SimpleArrayReader(gen.data, k);
+ Dis_PPAHStream rphit = new Dis_PPAHStream(o);
+// rphit.threads = 4;
+ o.setDecoderType(new Spherical(32, 4, 1));
+ // o.setDimparameter(31);
+ o.setOfflineClusterer(new KMeans2());
+ long startTime = System.nanoTime();
+ List centsr = rphit.getCentroids();
+ avgtime += (System.nanoTime() - startTime) / 100000000;
+
+ // avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(),
+ // gen.getData());
+
+ // System.out.printf("%.0f\t",
+ // StatTests.WCSSECentroidsFloat(centsr, gen.data));
+ // System.gc();
+
+ }
+ System.out.printf("%.0f\n", avgrealwcss / count);
+ }
+ }
+
+ @Override
+ public RPHashObject getParam() {
+ return so;
+ }
+
+ @Override
+ public void setWeights(List counts) {
+ // TODO Auto-generated method stub
+
+ }
+
+ @Override
+ public void setData(List centroids) {
+ this.centroids = centroids;
+
+ }
+
+ @Override
+ public void setRawData(List centroids) {
+ if (this.centroids == null)
+ this.centroids = new ArrayList<>(centroids.size());
+ for (float[] f : centroids) {
+ this.centroids.add(new Centroid(f, 0));
+ }
+ }
+
+ @Override
+ public void setK(int getk) {
+ // TODO Auto-generated method stub
+
+ }
+
+ @Override
+ public void reset(int randomseed) {
+ centroids = null;
+ so.setRandomSeed(randomseed);
+ }
+
+ @Override
+ public boolean setMultiRun(int runs) {
+ return true;
+ }
+}
diff --git a/src/main/java/edu/uc/rphash/Dis_PRPHashStream.java b/src/main/java/edu/uc/rphash/Dis_PRPHashStream.java
new file mode 100644
index 0000000..75614a0
--- /dev/null
+++ b/src/main/java/edu/uc/rphash/Dis_PRPHashStream.java
@@ -0,0 +1,371 @@
+package edu.uc.rphash;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ConcurrentSkipListSet;
+import java.util.concurrent.CopyOnWriteArrayList;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.Future;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.stream.Collectors;
+
+import edu.uc.rphash.Readers.RPHashObject;
+import edu.uc.rphash.Readers.SimpleArrayReader;
+import edu.uc.rphash.decoders.Decoder;
+import edu.uc.rphash.decoders.Spherical;
+import edu.uc.rphash.frequentItemSet.ItemSet;
+import edu.uc.rphash.frequentItemSet.SimpleFrequentItemSet;
+import edu.uc.rphash.lsh.LSH;
+//import edu.uc.rphash.projections.DBFriendlyProjection;
+import edu.uc.rphash.projections.Projector;
+import edu.uc.rphash.standardhash.HashAlgorithm;
+import edu.uc.rphash.standardhash.NoHash;
+import edu.uc.rphash.tests.clusterers.KMeans2;
+import edu.uc.rphash.tests.generators.GenerateData;
+import edu.uc.rphash.util.VectorUtil;
+
+public class Dis_PRPHashStream implements Clusterer {
+ // float variance;
+
+ public ItemSet is;
+
+ List labels;
+ HashMap labelmap;
+
+ private int processors = 1;
+
+ public static long mapfunc(float[] vec, LSH lshfunc) {
+
+ return lshfunc.lshHash(vec);
+
+ }
+
+ public RPHashObject mapreduce1() {
+
+ //------------This is Setup Code-------------
+ // create our LSH Machine
+ HashAlgorithm hal = new NoHash(so.getHashmod());
+ Iterator vecs = so.getVectorIterator();
+ if (!vecs.hasNext())
+ return so;
+
+ int logk = (int) (.5 + Math.log(so.getk()) / Math.log(2));// log k and
+ // round to
+ // integer
+ int k = so.getk() * logk;
+ is = new SimpleFrequentItemSet(k);
+ Decoder dec = so.getDecoderType();
+ dec.setCounter(is);
+
+ Projector p = so.getProjectionType();
+ p.setOrigDim(so.getdim());
+ p.setProjectedDim(dec.getDimensionality());
+ p.setRandomSeed(so.getRandomSeed());
+ p.init();
+ // no noise to start with
+ List noise = LSH.genNoiseTable(
+ dec.getDimensionality(),
+ so.getNumBlur(),
+ new Random(),
+ dec.getErrorRadius()
+ / (dec.getDimensionality() * dec.getDimensionality()));
+
+ LSH lshfunc = new LSH(dec, p, hal, noise, so.getNormalize());
+
+ // add to frequent itemset the hashed Decoded randomly projected vector
+
+
+ List dat = so.getRawData();
+
+ //Dey
+ //-------------------------
+ //------------This is the actual map function-------------
+
+ //this is the actual map
+ ForkJoinPool forkJoinPool = new ForkJoinPool(this.processors );
+ try {
+ forkJoinPool.submit(() ->
+ dat.parallelStream().map(s->mapfunc(s,lshfunc)).forEach(s->is.add(s))
+ ).get();
+ } catch (ExecutionException|InterruptedException e) {
+ e.printStackTrace();
+ }
+ forkJoinPool.shutdown();
+
+ //-------------------------
+
+
+ //------------This is clean up code-------------
+ List topids = is.getTop();
+ so.setPreviousTopID(topids);
+
+ List topsizes = is.getCounts();
+
+
+ // this is where the parallel reduce function would be
+ // to sum up the counts that correspond to hash_ids
+ // so very much the word count example
+ List countsAsFloats = new ArrayList();
+ for (long ct : topsizes)
+ countsAsFloats.add((float) ct);
+ so.setCounts(countsAsFloats);
+ return so;
+ }
+
+ public static void redFunc(float[] vec, LSH lshfunc, List noise,
+ List labels, List centroids) {
+ long[] hash = lshfunc.lshHashRadius(vec, noise);
+ labels.add(-1l);
+ // radius probe around the vector
+ for (Centroid cent : centroids) {
+ for (long h : hash) {
+ if (cent.ids.contains(h)) {
+ cent.updateVec(vec);
+ labels.set(labels.size() - 1, cent.id);
+ }
+ }
+ }
+ }
+
+ public static long[] redFunc(float[] vec, LSH lshfunc, List noise) {
+ return lshfunc.lshHashRadius(vec, noise);
+ }
+
+ /*
+ * This is the second phase after the top ids have been in the reduce phase
+ * aggregated
+ */
+ public RPHashObject mapreduce2() {
+
+ //------------This is Setup Code-------------
+ Iterator vecs = so.getVectorIterator();
+ if (!vecs.hasNext())
+ return so;
+ float[] vec = vecs.next();
+
+ HashAlgorithm hal = new NoHash(so.getHashmod());
+ Decoder dec = so.getDecoderType();
+
+ Projector p = so.getProjectionType();
+ p.setOrigDim(so.getdim());
+ p.setProjectedDim(dec.getDimensionality());
+ p.setRandomSeed(so.getRandomSeed());
+ p.init();
+
+ List noise = LSH.genNoiseTable(
+ so.getdim(),
+ so.getNumBlur(),
+ new Random(so.getRandomSeed()),
+ (float) (dec.getErrorRadius())
+ / (float) (dec.getDimensionality() * dec
+ .getDimensionality()));
+
+ LSH lshfunc = new LSH(dec, p, hal, noise, so.getNormalize());
+ ArrayList centroids = new ArrayList();
+
+ for (long id : so.getPreviousTopID()) {
+ centroids.add(new Centroid(so.getdim(), id, -1));
+ }
+
+ //DEY
+ //-------------------------------------------------
+ //------------This is the parallel map-------------
+
+ List dat = so.getRawData();
+
+
+ ForkJoinPool forkJoinPool = new ForkJoinPool(this.processors );
+ try {
+ //parallel map
+ forkJoinPool.submit(() ->
+ dat.parallelStream().map(s->redFunc(s,lshfunc,noise)).forEach(hashes -> {
+ //end parallel map
+
+ //parallel reduce
+ //local centroids is what would need to be implemented
+ // to update in parallel in each node
+ // currently this thing shares the centroids list, which is a bottleneck
+ // the reducer would need to use this to reduce centroids with the same id
+ // Centroid.merge(ctcent1, cent1,wcsscent1,ctcent2, cent2,wcsscent2);
+// List localcentroids = centroids.stream().map(Centroid::new).collect(Centroid.toArrayList());
+ for (Centroid cent : centroids) {
+ for (long h : hashes)
+ {
+ if (cent.ids.contains(h))
+ {
+ cent.updateVec(vec);
+ }
+ }
+ }
+ })).get();
+ } catch (InterruptedException|ExecutionException e) {
+ e.printStackTrace();
+ }
+
+ forkJoinPool.shutdown();
+ //-------------------------------------------------
+
+ //------------This is the cleanup code-------------
+ //Sequential
+
+ Clusterer offlineclusterer = new KMeans2();//so.getOfflineClusterer();
+ offlineclusterer.setData(centroids.stream().collect(Centroid.toArrayList()));
+ offlineclusterer.setWeights(so.getCounts());
+ offlineclusterer.setK(so.getk());
+
+// this.labelmap = VectorUtil.generateIDMap(centroids, this.centroids);
+ so.setCentroids(offlineclusterer.getCentroids());
+ return so;
+ }
+
+ // 271458
+ // 264779.7
+
+ public List getLabels() {
+ for (int i = 0; i < labels.size(); i++) {
+ if (labelmap.containsKey(labels.get(i))) {
+ labels.set(i, labelmap.get(labels.get(i)));
+ } else {
+ labels.set(i, -1l);
+ }
+ }
+ return this.labels;
+ }
+
+ private List centroids = null;
+ private RPHashObject so;
+
+ public Dis_PRPHashStream(List data, int k) {
+ so = new SimpleArrayReader(data, k);
+ }
+
+// int threads = 1;
+
+ public Dis_PRPHashStream(List data, int k, int processors) {
+
+ this.processors = processors;
+ so = new SimpleArrayReader(data, k);
+ so.setParallel(true);
+ }
+
+ public Dis_PRPHashStream(List data, int k, int times, int rseed) {
+ so = new SimpleArrayReader(data, k);
+ }
+
+ public Dis_PRPHashStream(RPHashObject so) {
+ this.so = so;
+ }
+
+ public List getCentroids(RPHashObject so) {
+ this.so = so;
+ if (centroids == null)
+ run();
+ return centroids;
+ }
+
+ @Override
+ public List getCentroids() {
+ if (centroids == null)
+ run();
+
+ return centroids;
+ }
+
+ private void run() {
+ mapreduce1();
+ mapreduce2();
+ //this.centroids = so.getCentroids();
+ }
+
+ public static void main(String[] args) {
+ int k = 10;
+ int d = 1000;
+ int n = 10000;
+ float var = 1f;
+ int count = 5;
+ System.out.printf("Decoder: %s\n", "Sphere");
+ System.out.printf("ClusterVar\t");
+ for (int i = 0; i < count; i++)
+ System.out.printf("Trial%d\t", i);
+ System.out.printf("RealWCSS\n");
+
+ for (float f = var; f < 3.01; f += .05f) {
+ float avgrealwcss = 0;
+ float avgtime = 0;
+ System.out.printf("%f\t", f);
+ for (int i = 0; i < count; i++) {
+ GenerateData gen = new GenerateData(k, n / k, d, f, true, 1f);
+ RPHashObject o = new SimpleArrayReader(gen.data, k);
+ Dis_PRPHashStream rphit = new Dis_PRPHashStream(o);
+// rphit.threads = 4;
+ o.setDecoderType(new Spherical(32, 4, 1));
+ // o.setDimparameter(31);
+ o.setOfflineClusterer(new KMeans2());
+ long startTime = System.nanoTime();
+ List centsr = rphit.getCentroids();
+ avgtime += (System.nanoTime() - startTime) / 100000000;
+
+ // avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(),
+ // gen.getData());
+
+ // System.out.printf("%.0f\t",
+ // StatTests.WCSSECentroidsFloat(centsr, gen.data));
+ // System.gc();
+
+ }
+ System.out.printf("%.0f\n", avgrealwcss / count);
+ }
+ }
+
+ @Override
+ public RPHashObject getParam() {
+ return so;
+ }
+
+ @Override
+ public void setWeights(List counts) {
+ // TODO Auto-generated method stub
+
+ }
+
+ @Override
+ public void setData(List centroids) {
+ this.centroids = centroids;
+
+ }
+
+ @Override
+ public void setRawData(List centroids) {
+ if (this.centroids == null)
+ this.centroids = new ArrayList<>(centroids.size());
+ for (float[] f : centroids) {
+ this.centroids.add(new Centroid(f, 0));
+ }
+ }
+
+ @Override
+ public void setK(int getk) {
+ // TODO Auto-generated method stub
+
+ }
+
+ @Override
+ public void reset(int randomseed) {
+ centroids = null;
+ so.setRandomSeed(randomseed);
+ }
+
+ @Override
+ public boolean setMultiRun(int runs) {
+ return true;
+ }
+}
diff --git a/src/main/java/edu/uc/rphash/PPAHStream.java b/src/main/java/edu/uc/rphash/PPAHStream.java
new file mode 100644
index 0000000..3970bc1
--- /dev/null
+++ b/src/main/java/edu/uc/rphash/PPAHStream.java
@@ -0,0 +1,810 @@
+package edu.uc.rphash;
+
+// This class will run the Parameter-free Projected Adaptive Hash Stream Clustering
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Comparator;
+//import java.util.Arrays;
+import java.util.HashMap;
+//import java.util.Iterator;
+//import java.util.LinkedHashMap;
+import java.util.List;
+//import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Random;
+import java.util.TreeSet;
+import java.util.stream.Stream;
+import java.util.Collections;
+
+import edu.uc.rphash.Readers.RPHashObject;
+import edu.uc.rphash.Readers.SimpleArrayReader;
+import edu.uc.rphash.kneefinder.JythonTest;
+import edu.uc.rphash.projections.Projector;
+import edu.uc.rphash.tests.StatTests;
+import edu.uc.rphash.tests.clusterers.Agglomerative3;
+import edu.uc.rphash.tests.clusterers.KMeans2;
+import edu.uc.rphash.tests.clusterers.Agglomerative3.ClusteringType;
+import edu.uc.rphash.tests.generators.GenerateData;
+import edu.uc.rphash.util.VectorUtil;
+import edu.uc.rphash.tests.clusterers.DBScan;
+import edu.uc.rphash.tests.clusterers.MultiKMPP;
+
+
+//import org.apache.commons.collections.map.MultiValueMap;
+//import org.apache.commons.collections.map.*;
+import com.google.common.collect.ArrayListMultimap;
+import com.google.common.collect.Multimap;
+
+
+// https://www.javatips.net/api/webofneeds-master/webofneeds/won-matcher-solr/src/main/java/won/matcher/solr/utils/Kneedle.java
+// https://github.com/lukehb/137-stopmove/blob/master/src/main/java/onethreeseven/stopmove/algorithm/Kneedle.java
+
+// This algorithm runs TWRP three times (only the random bisection vector
+// varies; the projection matrix stays the same) and keeps the run whose
+// 10x candidate centroids achieve the best (lowest) offline WCSS.
+
+
+public class PPAHStream implements Clusterer, Runnable {
+
+
+ List labels; // to directly output labels
+ HashMap labelmap; // to directly output labels
+ public List getLabels() {
+ for (int i = 0; i < labels.size(); i++) {
+ if (labelmap.containsKey(labels.get(i))) {
+ labels.set(i, labelmap.get(labels.get(i)));
+ } else {
+ labels.set(i, -1l);
+ }
+ }
+ return this.labels;
+ }
+
+
+
+ boolean znorm = false;
+ private int counter;
+ private float[] rngvec;
+ private float[] rngvec2;
+ private float[] rngvec3;
+ private float eps;
+
+ private List centroids = null;
+
+ private RPHashObject so;
+
+ public PPAHStream(RPHashObject so) {
+ this.so = so;
+ }
+
+ public List getCentroids(RPHashObject so) {
+ this.so = so;
+ return getCentroids();
+ }
+
+ @Override
+ public List getCentroids() {
+ if (centroids == null)
+ run();
+ return centroids;
+ }
+
+
+// This function returns the square of the euclidean distance.
+ public static float distancesq(float[] x, float[] y) {
+ if (x.length < 1)
+ return Float.MAX_VALUE;
+ if (y.length < 1)
+ return Float.MAX_VALUE;
+ float dist = (x[0] - y[0]) * (x[0] - y[0]);
+ for (int i = 1; i < x.length; i++)
+ dist += ((x[i] - y[i]) * (x[i] - y[i]));
+// return (float) Math.sqrt(dist);
+ return dist;
+ }
+
+
+ /*
+ * X - set of vectors compute the medoid of a vector set
+ */
+ float[] medoid(List X) {
+ float[] ret = X.get(0);
+ for (int i = 1; i < X.size(); i++) {
+ for (int j = 0; j < ret.length; j++) {
+ ret[j] += X.get(i)[j];
+ }
+ }
+ for (int j = 0; j < ret.length; j++) {
+ ret[j] = ret[j] / ((float) X.size());
+ }
+ return ret;
+ }
+
+// this updates the map two cents with different weights are merged into one.
+ public static float[][] UpdateHashMap(float cnt_1, float[] x_1, float wcss_1,
+ float cnt_2, float[] x_2 , float wcss_2) {
+
+ float cnt_r = cnt_1 + cnt_2;
+
+ float[] x_r = new float[x_1.length];
+
+ for (int i = 0; i < x_1.length; i++) {
+ x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r;
+
+ }
+// float wcss = (distancesq(x_r,x_2)/cnt_r) + wcss_1;
+// float wcss = ( ( cnt_1*(wcss_1 + distancesq(x_r,x_1)) ) + distancesq(x_r,x_2) ) / (cnt_1);
+
+ float wcss = ( ((wcss_1 + distancesq(x_r,x_1)) ) + distancesq(x_r,x_2) );
+
+// float wcss = ( ( ( cnt_1*(wcss_1 + distancesq(x_r,x_1)) ) + distancesq(x_r,x_2) ) / (cnt_r) );
+// System.out.println("wcsse = " + wcss);
+
+ float[][] ret = new float[3][];
+ ret[0] = new float[1];
+ ret[0][0] = cnt_r;
+ ret[1] = x_r;
+ ret[2]= new float [1];
+ ret[2][0]= wcss;
+ return ret;
+
+ }
+
+
+ public long hashvec2( float[] xt, float[] x,
+ HashMap MapOfIDAndCent, HashMap MapOfIDAndCount, int ct, float[] rngvec, HashMap MapOfIDAndWCSS) {
+ long s = 1; //fixes leading 0's bug
+ for (int i = 0; i < xt.length; i++) {
+ s = s << 1 ; // left shift the bits of s by 1.
+ if (xt[i] > rngvec[i])
+ s= s+1;
+
+ if (MapOfIDAndCent.containsKey(s)) {
+
+ float CurrentCount = MapOfIDAndCount.get(s);
+ float CurrentCent [] = MapOfIDAndCent.get(s);
+ float CountForIncomingVector = 1;
+ float IncomingVector [] = x;
+ float currentWcss= MapOfIDAndWCSS.get(s);
+ float incomingWcss= 0;
+
+ float[][] MergedValues = UpdateHashMap(CurrentCount , CurrentCent, currentWcss, CountForIncomingVector, IncomingVector, incomingWcss );
+
+ Long UpdatedCount = (long) MergedValues[0][0] ;
+
+ float[] MergedVector = MergedValues[1] ;
+
+ float wcss= MergedValues[2][0];
+
+ MapOfIDAndCount.put(s , UpdatedCount);
+
+ MapOfIDAndCent.put(s, MergedVector);
+
+ MapOfIDAndWCSS.put(s, wcss);
+
+ }
+
+ else {
+
+ float[] xlist = x;
+ MapOfIDAndCent.put(s, xlist);
+ MapOfIDAndCount.put(s, (long)1);
+ MapOfIDAndWCSS.put(s, (float)0);
+ }
+ }
+ return s;
+ }
+
+
+
+ /*
+ * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid
+ * vector map
+ *
+ * hash the projected vector x and update the hash to centroid and counts
+ * maps
+ */
+ void addtocounter(float[] x, Projector p,
+ HashMap IDAndCent,HashMap IDandID,int ct, float[] rngvec , HashMap IDandWCSS) {
+ float[] xt = p.project(x);
+
+ hashvec2(xt,x,IDAndCent, IDandID, ct,rngvec , IDandWCSS);
+ }
+
+
+ static boolean isPowerOfTwo(long num) {
+ return (num & -num) == num;
+ }
+
+
+ public void printHashmap(HashMap hashmap) {
+
+ System.out.println(hashmap.keySet());
+ System.out.println(hashmap.values());
+
+ }
+public void printStream(Stream> stream) {
+
+ //System.out.println(hashmap.keySet());
+ System.out.println(stream.count());
+
+}
+// this method calculates the epsilon value and prints the information.
+public float printInfo(ListsetofKeys, HashMap MapOfIDAndCount, HashMap MapOfIDAndCent, HashMap MapOfIDAndWCSS) {
+
+ List counts = new ArrayList<>();
+// List wcsseprint = new ArrayList<>();
+ List wcsseprint = new ArrayList<>();
+// float temp = 0;
+ int elements=0;
+ float avg=0;
+
+ for (Long keys: setofKeys)
+ {
+ elements=elements+1;
+//// System.out.println(MapOfIDAndCount.get(keys));
+ counts.add(MapOfIDAndCount.get(keys));
+ wcsseprint.add(MapOfIDAndWCSS.get(keys).longValue());
+
+ }
+// System.out.println();
+ System.out.print(counts);
+
+
+
+// for (Long keys: setofKeys)
+// {
+// System.out.println(MapOfIDAndWCSS.get(keys));
+// wcsseprint.add(MapOfIDAndWCSS.get(keys));
+// }
+
+ // calculation of epsilon
+ /*
+ for (int i=0 ; i<(0.8*elements); i++) //for (int i=0 ; i<(0.8*elements); i++)
+ {
+ temp = temp + (wcsseprint.get(i))/(counts.get(i));
+ }
+ avg = (float) (temp/(0.8*elements));
+ System.out.println();
+ System.out.println("\taverage epsilon = "+ avg);
+ */
+ Collections.sort(wcsseprint);
+ Collections.reverse(wcsseprint);
+ System.out.println();
+ System.out.println(wcsseprint);
+ System.out.println();
+
+ JythonTest elbowcalculator = new JythonTest();
+ int num_of_clusters= elbowcalculator.find_elbow(wcsseprint);
+ //int num_of_clusters_2= elbowcalculator.find_elbow(counts);
+ System.out.println("\n" + "No. of clusters_by_WCSS = " + num_of_clusters);
+ //System.out.println( "No. of clusters_by_COUNT = " + num_of_clusters_2);
+ System.out.println( "************************************************************" );
+
+
+ return (avg);
+ }
+
+ /*
+ * X - data set k - canonical k in k-means l - clustering sub-space Compute
+ * density mode via iterative deepening hash counting
+ */
+
+ public Multimap findDensityModes2() {
+ //public Map findDensityModes2() {
+ HashMap MapOfIDAndCent1 = new HashMap<>();
+ HashMap MapOfIDAndCount1 = new HashMap<>();
+ HashMap MapOfIDAndWCSS1 = new HashMap<>();
+
+ HashMap MapOfIDAndCent2 = new HashMap<>();
+ HashMap MapOfIDAndCount2 = new HashMap<>();
+ HashMap MapOfIDAndWCSS2 = new HashMap<>();
+
+ HashMap MapOfIDAndCent3 = new HashMap<>();
+ HashMap MapOfIDAndCount3 = new HashMap<>();
+ HashMap MapOfIDAndWCSS3 = new HashMap<>();
+
+
+ // #create projector matrixs
+ Projector projector = so.getProjectionType();
+ projector.setOrigDim(so.getdim());
+ projector.setProjectedDim(so.getDimparameter());
+
+ projector.setRandomSeed(so.getRandomSeed());
+ //projector.setRandomSeed(949124732);
+
+ projector.init();
+ int cutoff = so.getCutoff();
+
+ int ct = 0;
+ int ct2 = 0;
+ int ct3 =0;
+
+ {
+
+ for (float[] x : so.getRawData())
+ {
+ addtocounter(x, projector, MapOfIDAndCent1, MapOfIDAndCount1,ct++, rngvec, MapOfIDAndWCSS1);
+ addtocounter(x, projector, MapOfIDAndCent2, MapOfIDAndCount2,ct2++, rngvec2,MapOfIDAndWCSS2);
+ addtocounter(x, projector, MapOfIDAndCent3, MapOfIDAndCount3,ct3++, rngvec3,MapOfIDAndWCSS3);
+
+ }
+ }
+
+ System.out.println("\nNumberOfVectors = , "+ ct);
+ System.out.println("\nNumberOfMicroClustersBeforePruning = , "+ MapOfIDAndCent1.size());
+ //printHashmap(MapOfIDAndCount1);
+
+ // next we want to prune the tree by parent count comparison
+ // follows breadthfirst search
+
+ HashMap denseSetOfIDandCount2_1 = new HashMap();
+ for (Long cur_id : new TreeSet(MapOfIDAndCount1.keySet()))
+ {
+ //if (cur_id >so.getk()){
+ if (cur_id > Long.valueOf(3)){
+ int cur_count = (int) (MapOfIDAndCount1.get(cur_id).longValue());
+ long parent_id = cur_id>>>1;
+ int parent_count = (int) (MapOfIDAndCount1.get(parent_id).longValue());
+
+ if(cur_count!=0 && parent_count!=0)
+ {
+ if(cur_count == parent_count) {
+ denseSetOfIDandCount2_1.put(parent_id, 0L);
+ // IDAndCent.put(parent_id, new ArrayList<>());
+
+ MapOfIDAndCent1.put(parent_id, new float[]{});
+
+ // MapOfIDAndCount1.put(parent_id, new Long (0));
+
+ denseSetOfIDandCount2_1.put(cur_id, (long) cur_count);
+
+ }
+ else
+ {
+ if(2 * cur_count > parent_count) {
+ denseSetOfIDandCount2_1.remove(parent_id);
+
+ // IDAndCent.put(parent_id, new ArrayList<>());
+ MapOfIDAndCent1.put(parent_id, new float[]{});
+ // MapOfIDAndCount.put(parent_id, new Long (0));
+
+ denseSetOfIDandCount2_1.put(cur_id, (long) cur_count);
+ }
+ }
+ }
+ }
+ }
+
+ HashMap denseSetOfIDandCount2_2 = new HashMap();
+ for (Long cur_id : new TreeSet(MapOfIDAndCount2.keySet()))
+ {
+
+ //if (cur_id >so.getk()){
+ if (cur_id > Long.valueOf(7)){
+ int cur_count = (int) (MapOfIDAndCount2.get(cur_id).longValue());
+ long parent_id = cur_id>>>1;
+ int parent_count = (int) (MapOfIDAndCount2.get(parent_id).longValue());
+
+ if(cur_count!=0 && parent_count!=0)
+ {
+ if(cur_count == parent_count) {
+ denseSetOfIDandCount2_2.put(parent_id, 0L);
+ // IDAndCent.put(parent_id, new ArrayList<>());
+
+ MapOfIDAndCent2.put(parent_id, new float[]{});
+
+ // MapOfIDAndCount2.put(parent_id, new Long (0));
+
+ denseSetOfIDandCount2_2.put(cur_id, (long) cur_count);
+
+ }
+ else
+ {
+ if(2 * cur_count > parent_count) {
+ denseSetOfIDandCount2_2.remove(parent_id);
+
+ // IDAndCent.put(parent_id, new ArrayList<>());
+ MapOfIDAndCent2.put(parent_id, new float[]{});
+ // MapOfIDAndCount2.put(parent_id, new Long (0));
+
+ denseSetOfIDandCount2_2.put(cur_id, (long) cur_count);
+ }
+ }
+ }
+ }
+ }
+
+
+ HashMap denseSetOfIDandCount2_3 = new HashMap();
+ for (Long cur_id : new TreeSet(MapOfIDAndCount3.keySet()))
+ {
+ //if (cur_id >so.getk()){
+ if (cur_id > Long.valueOf(11)){
+ int cur_count = (int) (MapOfIDAndCount3.get(cur_id).longValue());
+ long parent_id = cur_id>>>1;
+ int parent_count = (int) (MapOfIDAndCount3.get(parent_id).longValue());
+
+ if(cur_count!=0 && parent_count!=0)
+ {
+ if(cur_count == parent_count) {
+ denseSetOfIDandCount2_3.put(parent_id, 0L);
+ // IDAndCent.put(parent_id, new ArrayList<>());
+
+ MapOfIDAndCent3.put(parent_id, new float[]{});
+
+ // MapOfIDAndCount.put(parent_id, new Long (0));
+
+ denseSetOfIDandCount2_3.put(cur_id, (long) cur_count);
+
+ }
+ else
+ {
+ if(2 * cur_count > parent_count) {
+ denseSetOfIDandCount2_3.remove(parent_id);
+
+ // IDAndCent.put(parent_id, new ArrayList<>());
+ MapOfIDAndCent3.put(parent_id, new float[]{});
+ // MapOfIDAndCount.put(parent_id, new Long (0));
+
+ denseSetOfIDandCount2_3.put(cur_id, (long) cur_count);
+ }
+ }
+ }
+ }
+ }
+
+
+ //remove keys with support less than 1
+ Stream> stream2_1 = denseSetOfIDandCount2_1.entrySet().stream().filter(p -> p.getValue() > 1);
+ List sortedIDList2_1= new ArrayList<>();
+ // sort and limit the list
+ stream2_1.sorted(Entry. comparingByValue().reversed()).limit(cutoff)
+ .forEachOrdered(x -> sortedIDList2_1.add(x.getKey()));
+ // printHashmap(denseSetOfIDandCount2_1);
+
+
+ Stream> stream2_2 = denseSetOfIDandCount2_2.entrySet().stream().filter(p -> p.getValue() > 1);
+ List sortedIDList2_2= new ArrayList<>();
+ // sort and limit the list
+ stream2_2.sorted(Entry. comparingByValue().reversed()).limit(cutoff)
+ .forEachOrdered(x -> sortedIDList2_2.add(x.getKey()));
+ //printHashmap(denseSetOfIDandCount2_2);
+
+
+ Stream> stream2_3 = denseSetOfIDandCount2_3.entrySet().stream().filter(p -> p.getValue() > 1);
+ List sortedIDList2_3= new ArrayList<>();
+ // sort and limit the list
+ stream2_3.sorted(Entry. comparingByValue().reversed()).limit(cutoff)
+ .forEachOrdered(x -> sortedIDList2_3.add(x.getKey()));
+ //printHashmap(denseSetOfIDandCount2_3);
+
+ float WCSS1 = 0;
+ float WCSS2 = 0;
+ float WCSS3 = 0;
+
+ HashMap denseSetOfIDandCount2 = new HashMap();
+
+ HashMap MapOfIDAndCent = new HashMap<>();
+ HashMap MapOfIDAndCount = new HashMap<>();
+ HashMap MapOfIDAndWCSS = new HashMap<>();
+
+
+
+
+ //* THIS IS THE RUNTIME CALCULATION OF WCSS STATISTICS WHICH IS DONE ONLINE:
+
+ for (Long keys: sortedIDList2_1)
+// for (Long cur_id : (((HashMap) stream2_1).keySet()))
+ { // System.out.println("wcss1 = " + MapOfIDAndWCSS1.get(cur_id));
+ WCSS1 = WCSS1 + MapOfIDAndWCSS1.get(keys);}
+
+// for (Long cur_id : (denseSetOfIDandCount2_2.keySet()))
+ for (Long keys: sortedIDList2_2)
+ { WCSS2 = WCSS2 + MapOfIDAndWCSS2.get(keys);}
+
+// for (Long cur_id : (denseSetOfIDandCount2_3.keySet()))
+ for (Long keys: sortedIDList2_3)
+ { WCSS3 = WCSS3 + MapOfIDAndWCSS3.get(keys);}
+
+
+ System.out.println("wcss1(online calc) of candidate cents = , " + WCSS1);
+// System.out.println(" wcss_ofline_calc_1 = " + WCSS_off_1);
+
+ System.out.println("wcss1(online calc) of candidate cents = , " + WCSS2);
+// System.out.println(" wcss_ofline_calc_2 = " + WCSS_off_2);
+
+ System.out.println("wcss1(online calc) of candidate cents = , " + WCSS3);
+// System.out.println(" wcss_ofline_calc_3 = " + WCSS_off_3);
+
+ if ((WCSS1 <= WCSS2) && (WCSS1 <= WCSS3))
+ {MapOfIDAndCount = MapOfIDAndCount1;
+ MapOfIDAndCent = MapOfIDAndCent1;
+ MapOfIDAndWCSS = MapOfIDAndWCSS1;
+ denseSetOfIDandCount2 = denseSetOfIDandCount2_1;
+ System.out.println("winner = tree1");
+ }
+ else if ((WCSS2<= WCSS1) && (WCSS2<=WCSS3))
+ {MapOfIDAndCount = MapOfIDAndCount2;
+ MapOfIDAndCent = MapOfIDAndCent2;
+ MapOfIDAndWCSS = MapOfIDAndWCSS2;
+ denseSetOfIDandCount2 = denseSetOfIDandCount2_2;
+ System.out.println("winner = tree2");
+ }
+ else
+ {MapOfIDAndCount = MapOfIDAndCount3;
+ MapOfIDAndCent = MapOfIDAndCent3;
+ MapOfIDAndWCSS = MapOfIDAndWCSS3;
+ denseSetOfIDandCount2 = denseSetOfIDandCount2_3;
+ System.out.println("winner = tree3");
+
+ }
+
+ System.out.println("NumberOfMicroClusters_AfterPruning_&_beforesortingLimit = , "+ denseSetOfIDandCount2.size());
+
+ //remove keys with support less than 1
+ Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 2);
+
+ List sortedIDList2= new ArrayList<>();
+ // sort and limit the list
+ stream2.sorted(Entry. comparingByValue().reversed()).limit(cutoff)
+ .forEachOrdered(x -> sortedIDList2.add(x.getKey()));
+
+ System.out.println("------------------------------------------------------------------------------------------------------------------");
+ //printHashmap(denseSetOfIDandCount2);
+ float eps= printInfo(sortedIDList2,denseSetOfIDandCount2, MapOfIDAndCent,MapOfIDAndWCSS);
+// seteps(eps);
+
+
+
+ Multimap multimapWeightAndCent = ArrayListMultimap.create();
+
+ for (Long keys: sortedIDList2)
+
+ {
+
+ multimapWeightAndCent.put((Long)(MapOfIDAndCount.get(keys)), (float[]) (MapOfIDAndCent.get(keys)));
+
+ }
+
+
+ return multimapWeightAndCent;
+
+}
+
+ public void run() {
+ rngvec = new float[so.getDimparameter()];
+
+ rngvec2 = new float[so.getDimparameter()];
+
+ rngvec3 = new float[so.getDimparameter()];
+
+ counter = 0;
+ boolean randVect = so.getRandomVector();
+
+ // Random r = new Random(so.getRandomSeed());
+ // Random r = new Random(3800635955020675334L) ;
+
+ Random r = new Random();
+ //Random r = new Random(923063597592675214L) ;
+ Random r2 = new Random();
+ //Random r2 = new Random(923063597592675214L) ;
+ Random r3 = new Random();
+ //Random r3 = new Random(923063597592675214L) ;
+
+ if (randVect==true){
+ for (int i = 0; i < so.getDimparameter(); i++) {
+ rngvec[i] = (float) r.nextGaussian();
+ //System.out.println(rngvec[i]);
+ }
+ for (int i = 0; i < so.getDimparameter(); i++)
+ rngvec2[i] = (float) r2.nextGaussian();
+
+ for (int i = 0; i < so.getDimparameter(); i++)
+ rngvec3[i] = (float) r3.nextGaussian();
+
+ } else {
+ for (int i = 0; i < so.getDimparameter(); i++)
+ rngvec[i] = (float) 0;
+ }
+
+ Multimap WeightAndClusters = findDensityModes2();
+
+ Listcentroids2 = new ArrayList<>();
+ List weights2 =new ArrayList<>();
+
+ System.out.println("\tNumberOfMicroClusters_AfterPruning = , "+ WeightAndClusters.size());
+// System.out.println("getRandomVector = "+ randVect);
+
+ for (Long weights : WeightAndClusters.keys())
+ {
+
+ weights2.add((float)weights);
+
+ }
+
+
+ for (Long weight : WeightAndClusters.keySet())
+
+ {
+
+ centroids2.addAll(WeightAndClusters.get(weight));
+
+ }
+
+// Agglomerative3 aggloOffline = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids2, so.getk());
+// aggloOffline.setWeights(weights2);
+// this.centroids = aggloOffline.getCentroids();
+
+ KMeans2 aggloOffline2 = new KMeans2();
+ aggloOffline2.setK(so.getk());
+ aggloOffline2.setRawData(centroids2);
+// aggloOffline2.setWeights(weights2);
+ this.centroids = aggloOffline2.getCentroids();
+
+// MultiKMPP aggloOffline3 = new MultiKMPP(centroids2,so.getk());
+// this.centroids = aggloOffline3.getCentroids();
+
+//// DBScan algo = new DBScan(centroids2, (eps/(20)), 3);
+//// System.out.println("epsssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss = "+ eps/(20));
+//// this.centroids = algo.getCentroids();
+//// System.out.println("no. of final output centroids = "+ centroids.size());
+
+ }
+
+ public static void main(String[] args) throws FileNotFoundException,
+ IOException, InterruptedException {
+
+ System.gc();
+
+ // int k ; //= 10;
+ // int d = 200;//16;
+ // int n = 10000;
+ // float var = 1.5f;
+ // int count = 1;
+ // System.out.printf("ClusterVar\t");
+ // for (int i = 0; i < count; i++)
+ // System.out.printf("Trial%d\t", i);
+ // System.out.printf("RealWCSS\n");
+
+ // String Output = "/C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/run_results/3runs/rnaseq_k4/OutputTwrpCents_dbscan" ;
+
+ // float f = var;
+ // float avgrealwcss = 0;
+ float avgtime = 0;
+ // System.out.printf("%f\t", f);
+ // GenerateData gen = new GenerateData(k, n/k, d, f, true, .5f);
+
+ // gen.writeCSVToFile(new File("/C:/Users/deysn/Desktop/temp/run_results/3runs/rough/1D.txt"));
+ // List data = "/C:/Users/user/Desktop/temp/OutputTwrpCents1"
+
+ // RPHashObject o = new SimpleArrayReader(gen.data, k);
+
+ boolean raw = Boolean.parseBoolean(("raw"));
+ List data = null;
+ // "/C:/Users/deysn/Desktop/temp/har/1D.txt" ; C:/Users/deysn/Documents/temp/covtype/1D.txt
+ // C:/Users/dey.sn/Downloads/temp/covtype/1D.csv ; "C:/Users/dey.sn/Downloads/temp/run_results/3runs/har_k6/1D.txt"
+ //String inputfile = "C:/Users/dey.sn/Downloads/temp/crop_mapping/1D.csv" ;
+ String inputfile = "C:/Users/sayan/OneDrive - University of Cincinnati/Documents/downloaded/sensorless_drive/1D.csv" ;
+ System.out.println(inputfile);
+ data = VectorUtil.readFile( inputfile , raw);
+ for (int k=10; k<=10 ;k++)
+ {
+ for (int i = 1; i <= 5; i++)
+ {
+ //k = 7;
+
+ RPHashObject o = new SimpleArrayReader(data, k);
+
+ o.setDimparameter(16);
+ o.setCutoff(250); //230
+ o.setRandomVector(true);
+
+// System.out.println("cutoff = "+ o.getCutoff());
+// System.out.println("get_random_Vector = "+ o.getRandomVector());
+
+// TWRPv6_wcss_offline2_TEST rphit = new TWRPv6_wcss_offline2_TEST(o);
+ PPAHStream rphit = new PPAHStream(o);
+
+
+ System.gc();
+
+ Runtime rt = Runtime.getRuntime();
+ rt.gc();
+ Thread.sleep(10);
+ rt.gc();
+ long startmemory = rt.totalMemory() - rt.freeMemory();
+ long startTime = System.nanoTime();
+
+ List centsr = rphit.getCentroids();
+
+ avgtime += (System.nanoTime() - startTime) / 1000000000f ;
+
+ float usedMB = ((rt.totalMemory() - rt.freeMemory()) - startmemory) / (1024*1024);
+
+ System.out.println(" Time(in sec), " + avgtime + ", Mem_Used(MB):, " + (usedMB/3) );
+
+ rt.gc();
+ Thread.sleep(10);
+ rt.gc();
+
+// avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(),gen.getData());
+// String Output = "/C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/run_results/3runs/rnaseq_k4/OutputTwrpCents_dbscan" ;
+ //String Output = "C:/Users/dey.sn/Downloads/work/output/cropmap_k7/cropmap_k7_kmeans_130_cutoff"+"_" +k+"_"+i+".csv" ;
+ String Output = "/C:/Users/sayan/OneDrive - University of Cincinnati/Documents/downloaded/results/har_6clus/har_k6_kmeans_130_cutoff"+"_" +k+"_"+i+".csv" ;
+ VectorUtil.writeCentroidsToFile(new File(Output),centsr, false);
+
+// VectorUtil.writeVectorFile(new File(Output+"_"+"labels"+".txt"), centsr.getLabels());
+
+
+// System.out.printf("WCSS for generated data = "+ "%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data));
+ System.out.printf(",WCSS for Winning Kmeans, = , "+ "%.0f ", StatTests.WCSSECentroidsFloat(centsr, data));
+ System.out.println(",k, is: , "+k);
+//
+ System.gc();
+ }
+ }
+
+// System.out.printf("%.0f\n", avgrealwcss / count);
+
+
+ }
+
+ @Override
+ public RPHashObject getParam() {
+ return so;
+ }
+
+ @Override
+ public void setWeights(List counts) {
+ // TODO Auto-generated method stub
+
+ }
+
+ @Override
+ public void setData(List centroids) {
+ this.centroids = centroids;
+
+ }
+
+ @Override
+ public void setRawData(List centroids) {
+ if (this.centroids == null)
+ this.centroids = new ArrayList<>(centroids.size());
+ for (float[] f : centroids) {
+ this.centroids.add(new Centroid(f, 0));
+ }
+ }
+
+ @Override
+ public void setK(int getk) {
+ this.so.setK(getk);
+ }
+
+ @Override
+ public void reset(int randomseed) {
+ centroids = null;
+ so.setRandomSeed(randomseed);
+ }
+
+ @Override
+ public boolean setMultiRun(int runs) {
+ return false;
+ }
+
+ //@Override
+ public void setCutoff(int getCutoff) {
+ this.so.setCutoff(getCutoff);
+ }
+
+ //@Override
+ public void setRandomVector(boolean getRandomVector) {
+ this.so.setRandomVector(getRandomVector);
+ }
+
+ public void seteps(float eps) {
+ this.eps=eps;
+ }
+}
diff --git a/src/main/java/edu/uc/rphash/PPAHStream_v2.java b/src/main/java/edu/uc/rphash/PPAHStream_v2.java
new file mode 100644
index 0000000..8082479
--- /dev/null
+++ b/src/main/java/edu/uc/rphash/PPAHStream_v2.java
@@ -0,0 +1,320 @@
+package edu.uc.rphash;
+
+/*
+ This class will run the Parameter-free Projected Adaptive Hash Stream Clustering
+ */
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Random;
+import java.util.Map.Entry;
+import java.util.TreeSet;
+import java.util.stream.Stream;
+
+import edu.uc.rphash.Readers.RPHashObject;
+import edu.uc.rphash.Readers.SimpleArrayReader;
+import edu.uc.rphash.projections.Projector;
+import edu.uc.rphash.tests.StatTests;
+import edu.uc.rphash.tests.generators.GenerateStreamData;
+
+
+
+
+public class PPAHStream_v2 implements StreamClusterer {
+
+
+ private float[] rngvec;
+ private List centroids = null;
+ private RPHashObject so;
+ // #create projector matrixs
+ Projector projector ;
+ int ct=0;
+ int pdim = 20;
+
+ public PPAHStream_v2(int k, GenerateStreamData gen, int i) {
+ so = new SimpleArrayReader(gen,k);
+ projector = so.getProjectionType();
+ projector.setOrigDim(so.getdim());
+ projector.setProjectedDim(pdim);
+ projector.setRandomSeed(so.getRandomSeed());
+ projector.init();
+ initTablesWith();
+ }
+
+ public List getCentroids(RPHashObject so) {
+ this.so = so;
+ return getCentroids();
+ }
+
+
+ /*
+ * X - set of vectors compute the medoid of a vector set
+ */
+ /** Add vector to running Centroid
+ * @param cnt_1,cnt_2
+ * @param x_1
+ */
+ public static float[] update_cent(int ct, float[] x, float[] cent){
+ for(int i=0;i 0)
+ s += 1;
+ addcent(s,x);
+ }
+ return s;
+ }
+
+
+ /*
+ * ===========================MinCount Sketch=======================
+ */
+ public static final long PRIME_MODULUS = (1L << 31) - 1;
+ private int depth;
+ private int width;
+ private int[][] tableS;
+ private float[][][] tableCent;
+ private long[] hashA;
+
+
+ private void initTablesWith() {
+ this.width = (int) Math.ceil(2 / .025);
+ this.depth = (int) Math.ceil(-Math.log(1 - .97) / Math.log(2));
+ this.tableS = new int[depth][width];
+ this.tableCent = new float[depth][width][];//we will fill these in as we need them
+ this.hashA = new long[depth];//hash offsets
+ Random r = new Random();
+ for (int i = 0; i < depth; ++i) {
+ hashA[i] = r.nextLong();
+ }
+ }
+
+ private int hash(long item, int i) {
+ long hash = hashA[i] * item;
+ hash += hash >>> 32;
+ hash &= PRIME_MODULUS;
+ return (int) (hash % width);
+
+ }
+
+ private int count(long lshhash) {
+ int min = (int) tableS[0][hash(lshhash, 0)];
+ for (int i = 1; i < depth; ++i) {
+ if (tableS[i][hash(lshhash, i)] < min)
+ min = (int) tableS[i][hash(lshhash, i)];
+ }
+ return min;
+ }
+
+ private float[] get_cent_sketch(long lshhash) {
+ int min = (int) tableS[0][hash(lshhash, 0)];
+ int mini = 0;
+ int minhtmp = 0;
+ for (int i = 1; i < depth; ++i) {
+ int htmp = hash(lshhash, i);
+ if (tableS[i][hash(lshhash, i)] < min){
+ mini = i;
+ minhtmp = htmp;
+ min = (int) tableS[i][htmp];
+ }
+ }
+
+ return tableCent[mini][minhtmp];
+ }
+
+ private void addcent(long lshhash, float[] x){
+
+ int htmp = hash(lshhash, 0);
+ int argmini = 0;
+ int argminhtmp = htmp;
+
+ tableS[0][htmp] += 1;
+ int min = (int) tableS[0][htmp];
+
+ for (int i = 1; i < depth; ++i) {
+ htmp = hash(lshhash, i);
+ tableS[i][htmp] += 1;
+
+ if (tableS[i][htmp] < min){
+ min = (int) tableS[i][htmp];
+ argmini = i;
+ argminhtmp = htmp;
+ }
+ }
+
+ if(tableCent[argmini][argminhtmp]==null){
+ tableCent[argmini][argminhtmp] = x;
+ }
+ else{
+ update_cent(min, x, tableCent[argmini][argminhtmp]);
+ }
+ }
+ /*
+ * ===========================MinCount Sketch=======================
+ */
+
+
+
+ /*
+ * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid
+ * vector map
+ *
+ * hash the projected vector x and update the hash to centroid and counts
+ * maps
+ */
+ void addtocounter(float[] x, Projector p) {
+ float[] xt = p.project(x);
+ hashvec(xt, x);
+ }
+
+ @Override
+ public long addVectorOnlineStep(float[] x) {
+ addtocounter(x, projector);
+ return 0;
+ }
+
+ @Override
+ public List getCentroidsOfflineStep() {
+
+ // next we want to prune the tree by parent count comparison
+ // follows breadthfirst search
+ HashMap densityAndID = new HashMap();
+ for (Long cur_id =0l;cur_id<2<>> 1;
+ long parent_count = count(parent_id);
+
+ if (2 * cur_count > parent_count) {
+ densityAndID.put(parent_id, 0l);
+ densityAndID.put(cur_id,cur_count);
+ }
+ }
+
+ //remove keys with support less than 2
+ Stream> stream = densityAndID.entrySet().stream().filter(p -> p.getValue() > 1);
+ //64 so 6 bits?
+ //stream = stream.filter(p -> p.getKey() > 64);
+
+ List sortedIDList= new ArrayList<>();
+ // sort and limit the list
+ stream.sorted(Entry. comparingByValue().reversed()).limit(so.getk()*1)
+ .forEachOrdered(x -> sortedIDList.add(x.getKey()));
+
+ // compute centroids
+ List estcents = new ArrayList<>();
+ for (int i = 0; i < sortedIDList.size(); i++) {
+ System.out.println(densityAndID.get(sortedIDList.get(i)));
+ if(get_cent_sketch(sortedIDList.get(i))!=null)
+ estcents.add(new Centroid( get_cent_sketch(sortedIDList.get(i))));
+ }
+
+ return estcents;
+ }
+
+ @Override
+ public void shutdown() {
+ }
+
+ @Override
+ public int getProcessors() {
+ return 0;
+ }
+
+ @Override
+ public List getCentroids() {
+ return null;
+ }
+
+
+ public static void main(String[] args) throws Exception {
+
+ int k = 10;
+ int d = 100;
+ int interval = 1000;
+ float var = 1f;
+
+ Runtime rt = Runtime.getRuntime();
+ GenerateStreamData gen = new GenerateStreamData(k, d, var, 1133131);
+
+ StreamClusterer rphit = new PPAHStream_v2(k, gen, 1);
+ //StreamClusterer rphit = new RPHashStreaming(k, gen, 1);
+
+ ArrayList vecsInThisRound = new ArrayList();
+
+ System.out.printf("Vecs\tMem(KB)\tTime\tWCSSE\n");
+ long timestart = System.nanoTime();
+ for (int i = 0; i < interval * 6; i++) {
+ vecsInThisRound.add(gen.generateNext());
+ if (i % interval == interval - 1) {
+ timestart = System.nanoTime();
+ for (float[] f : vecsInThisRound) {
+ rphit.addVectorOnlineStep(f);
+ }
+
+ List cents = rphit.getCentroidsOfflineStep();
+ long time = System.nanoTime() - timestart;
+ rt.gc();
+ long usedkB = (rt.totalMemory() - rt.freeMemory()) / 1024;
+ double wcsse = StatTests.WCSSECentroidsFloat(cents, vecsInThisRound);
+ vecsInThisRound = new ArrayList();
+ System.out.printf("%d\t%d\t%.4f\t%.4f\n", i, usedkB,
+ time / 1000000000f, wcsse);
+ }
+ }
+ }
+ @Override
+ public RPHashObject getParam() {
+ return so;
+ }
+
+ @Override
+ public void setWeights(List counts) {
+ // TODO Auto-generated method stub
+
+ }
+
+ @Override
+ public void setData(List centroids) {
+ this.centroids = centroids;
+
+ }
+
+ @Override
+ public void setRawData(List centroids) {
+ if (this.centroids == null)
+ this.centroids = new ArrayList<>(centroids.size());
+ for (float[] f : centroids) {
+ this.centroids.add(new Centroid(f, 0));
+ }
+ }
+
+ @Override
+ public void setK(int getk) {
+ this.so.setK(getk);
+ }
+
+ @Override
+ public void reset(int randomseed) {
+ centroids = null;
+ so.setRandomSeed(randomseed);
+ }
+
+ @Override
+ public boolean setMultiRun(int runs) {
+ // TODO Auto-generated method stub
+ return false;
+ }
+
+}
diff --git a/src/main/java/edu/uc/rphash/PRPHashStream.java b/src/main/java/edu/uc/rphash/PRPHashStream.java
new file mode 100644
index 0000000..a906431
--- /dev/null
+++ b/src/main/java/edu/uc/rphash/PRPHashStream.java
@@ -0,0 +1,279 @@
+package edu.uc.rphash;
+/*
+This class will run the Parameter-free Random Projection Hash Stream Clustering
+*/
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+
+import java.util.concurrent.TimeUnit;
+
+import edu.uc.rphash.Readers.RPHashObject;
+import edu.uc.rphash.Readers.SimpleArrayReader;
+import edu.uc.rphash.concurrent.VectorLevelConcurrency;
+import edu.uc.rphash.decoders.Decoder;
+import edu.uc.rphash.frequentItemSet.KHHCentroidCounter;
+//import edu.uc.rphash.frequentItemSet.KHHCountMinSketch.Tuple;
+import edu.uc.rphash.lsh.LSH;
+//import edu.uc.rphash.projections.DBFriendlyProjection;
+import edu.uc.rphash.projections.Projector;
+import edu.uc.rphash.standardhash.HashAlgorithm;
+import edu.uc.rphash.standardhash.MurmurHash;
+import edu.uc.rphash.tests.StatTests;
+import edu.uc.rphash.tests.clusterers.KMeansPlusPlus;
+import edu.uc.rphash.tests.generators.ClusterGenerator;
+import edu.uc.rphash.tests.generators.GenerateStreamData;
+
+public class PRPHashStream implements StreamClusterer {
+ public List is;
+ public List lshfuncs;
+ private StatTests vartracker;
+ private List> centroids = null;
+ private List bestcentroids = null;
+ private RPHashObject so;
+ ExecutorService executor;
+ private final int processors;
+ private int concurrentRuns;
+
+ boolean initialized=false;
+ @Override
+ public int getProcessors() {
+ return processors;
+ }
+
+ @Override
+ public long addVectorOnlineStep(final float[] vec) {
+ if(!initialized){
+ System.out.println("Not initialized!");
+ try {
+ Thread.sleep(100);
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
+ }
+ for(int i = 0;i(concurrentRuns);
+ lshfuncs = new ArrayList(concurrentRuns);
+ for(int i = 0;i noise = LSH.genNoiseTable(dec.getDimensionality(),
+ so.getNumBlur(), r, dec.getErrorRadius()
+ / dec.getDimensionality());
+ lshfunc[projidx] = new LSH(dec, p, hal, noise,so.getNormalize());
+ }
+ lshfuncs.add(lshfunc);
+ }
+ initialized = true;
+ }
+
+ public PRPHashStream(int k, ClusterGenerator c) {
+ so = new SimpleArrayReader(c, k);
+ if (so.getParallel())
+ this.processors = Runtime.getRuntime().availableProcessors();
+ else
+ this.processors = 1;
+ executor = Executors.newFixedThreadPool(this.processors );
+ init();
+ }
+
+ public PRPHashStream(List data, int k) {
+ so = new SimpleArrayReader(data, k);
+ if (so.getParallel())
+ this.processors = Runtime.getRuntime().availableProcessors();
+ else
+ this.processors = 1;
+ executor = Executors.newFixedThreadPool(this.processors );
+ init();
+ }
+
+ public PRPHashStream(RPHashObject so) {
+ this.so = so;
+ if (so.getParallel())
+ this.processors = Runtime.getRuntime().availableProcessors();
+ else
+ this.processors = 1;
+ executor = Executors.newFixedThreadPool(this.processors );
+ init();
+ }
+
+ public PRPHashStream(int k, GenerateStreamData c, int processors) {
+ so = new SimpleArrayReader(c, k);
+ if (so.getParallel())
+ this.processors = processors;
+ else
+ this.processors = 1;
+ executor = Executors.newFixedThreadPool(this.processors );
+ init();
+ }
+
+ @Override
+ public List getCentroids() {
+ if (centroids == null) {
+ init();
+ run();
+ getCentroidsOfflineStep();
+ }
+ return bestcentroids;
+ }
+
+ public List getCentroidsOfflineStep() {
+ if (so.getParallel()) {
+ executor.shutdown();
+ try {
+ executor.awaitTermination(10, TimeUnit.SECONDS);
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
+ executor = Executors.newFixedThreadPool(this.processors);
+ }
+
+ bestcentroids = new ArrayList();
+// List projIDs = new ArrayList();
+// List cents = is.getTop();
+// List counts = is.getCounts();
+//
+ List cents = new ArrayList();
+ int i = 0;
+ //get rid of size one clusters that are there just because they were added to the list at the end
+ for (; i < is.size() ; i++) {
+// if(is.get(i).count==1)break;
+ cents.addAll(is.get(i).getTop());
+ }
+
+ ;
+// counts = counts.subList(0, i);
+ Clusterer offlineclusterer = new KMeansPlusPlus();
+ offlineclusterer.setData(cents);
+ offlineclusterer.setK(so.getk());
+ cents = offlineclusterer.getCentroids();
+
+
+
+// while(centroids.size()so.getk())cents = offlineclusterer.getCentroids();
+// if(cents.size() vecs = so.getVectorIterator();
+// while (vecs.hasNext()) {
+// if (so.getParallel()) {
+// float[] vec = vecs.next();
+// executor.execute(new VectorLevelConcurrency(vec, lshfuncs,is,so));
+// } else {
+// addVectorOnlineStep(vecs.next());
+// }
+// }
+ }
+
+ public List getTopIdSizes() {
+ return null;
+// return is.getCounts();
+ }
+
+ @Override
+ public RPHashObject getParam() {
+ return so;
+ }
+
+ @Override
+ public void setWeights(List counts) {
+
+ }
+
+ @Override
+ public void setRawData(List data)
+ {
+// this.centroids = new ArrayList(data.size());
+// for(float[] f: data){
+// this.data.add(new Centroid(f,0));
+// }
+ }
+
+ @Override
+ public void setData(List centroids) {
+ ArrayList data = new ArrayList(centroids.size());
+ for(Centroid c : centroids)data.add(c.centroid());
+ setRawData(data);
+ }
+
+
+ @Override
+ public void setK(int getk) {
+
+ }
+
+ @Override
+ public void shutdown() {
+ if (so.getParallel()) {
+ executor.shutdown();
+ try {
+// System.out.println("Shutting Down");
+ executor.awaitTermination(1200, TimeUnit.SECONDS);
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
+ executor = Executors.newFixedThreadPool(this.processors );
+ }
+ }
+
+ @Override
+ public void reset(int randomseed) {
+ centroids = null;
+ so.setRandomSeed(randomseed);
+ }
+ @Override
+ public boolean setMultiRun(int runs) {
+ return false;
+ }
+
+}
diff --git a/src/main/java/edu/uc/rphash/RPHash.java b/src/main/java/edu/uc/rphash/RPHash.java
index e62b867..b0ed7be 100644
--- a/src/main/java/edu/uc/rphash/RPHash.java
+++ b/src/main/java/edu/uc/rphash/RPHash.java
@@ -22,11 +22,11 @@
import edu.uc.rphash.decoders.DepthProbingLSH;
import edu.uc.rphash.decoders.Dn;
import edu.uc.rphash.decoders.E8;
-import edu.uc.rphash.decoders.Golay;
+
import edu.uc.rphash.decoders.Leech;
import edu.uc.rphash.decoders.MultiDecoder;
import edu.uc.rphash.decoders.OriginDecoder;
-import edu.uc.rphash.decoders.PsdLSH;
+
import edu.uc.rphash.decoders.Spherical;
import edu.uc.rphash.projections.DBFriendlyProjection;
import edu.uc.rphash.projections.FJLTProjection;
@@ -34,13 +34,13 @@
import edu.uc.rphash.projections.NoProjection;
import edu.uc.rphash.projections.SVDProjection;
import edu.uc.rphash.tests.StatTests;
-import edu.uc.rphash.tests.clusterers.AdaptiveMeanShift;
+
import edu.uc.rphash.tests.clusterers.Agglomerative3;
import edu.uc.rphash.tests.clusterers.DummyClusterer;
import edu.uc.rphash.tests.clusterers.DBScan;
import edu.uc.rphash.tests.clusterers.KMeans2;
import edu.uc.rphash.tests.clusterers.KMeansPlusPlus;
-import edu.uc.rphash.tests.clusterers.LloydIterativeKmeans;
+
import edu.uc.rphash.tests.clusterers.MultiKMPP;
import edu.uc.rphash.tests.clusterers.StreamingKmeans;
import edu.uc.rphash.tests.clusterers.StreamingKmeans2;
@@ -51,17 +51,26 @@
public class RPHash {
static String[] clusteringmethods = { "simple", "streaming", "multiproj",
- "kmeans", "pkmeans","kmeansplusplus", "streamingkmeans", "adaptive","dummy" };
+ "kmeans", "pkmeans","kmeansplusplus", "streamingkmeans", "adaptive","dummy" ,"twrp" , "twrpbisect", "twrpbest", "twrpmergetree", "twrpbest_afterpruning",
+ "twrpbest_cov","twrpbest_meanvariance" };
+
static String[] offlineclusteringmethods = { "singlelink", "completelink",
"averagelink", "kmeans", "adaptivemeanshift", "kmpp", "multikmpp" , "dbscan", "none" };
+
static String[] projectionmethods = { "dbf", "fjlt", "rp", "svd", "noproj" };
+
static String[] ops = { "numprojections", "innerdecodermultiplier",
"numblur", "randomseed", "hashmod", "parallel", "streamduration",
"raw", "decayrate", "dimparameter", "decodertype",
"offlineclusterer", "runs", "normalize", "projection" };
+
static String[] decoders = { "dn", "e8", "golay", "multie8", "leech",
"multileech", "sphere", "levypstable", "cauchypstable",
"gaussianpstable", "adaptive", "origin" };
+
+ static String[] twrp_options = { "cutoff", "randomvector" };
+
+
public static void main(String[] args) throws NumberFormatException,
IOException, InterruptedException {
@@ -95,6 +104,12 @@ public static void main(String[] args) throws NumberFormatException,
System.out.print(s + " ,");
System.out.print("]\n");
+ System.out.print("\t twrp_options" + "\t:[");
+ for (String s : twrp_options)
+ System.out.print(s + " ,");
+ System.out.print("]\n");
+
+
System.exit(0);
}
@@ -114,6 +129,9 @@ public static void main(String[] args) throws NumberFormatException,
matched |= keyword.equals(match);
for (String match : decoders)
matched |= keyword.equals(match);
+ for (String match : twrp_options)
+ matched |= keyword.equals(match);
+
if (!matched)
unmatchedkeywords.add(keyword);
}
@@ -503,6 +521,20 @@ public static List runConfigs(List untaggedArgs,
o.setNormalize(Boolean.parseBoolean(taggedArgs.get("normalize")));
so.setNormalize(Boolean.parseBoolean(taggedArgs.get("normalize")));
}
+
+
+ if (taggedArgs.containsKey("cutoff")) {
+ o.setCutoff(Integer.parseInt(taggedArgs.get("cutoff")));
+ so.setCutoff(Integer.parseInt(taggedArgs.get("cutoff")));
+ }
+
+
+ if (taggedArgs.containsKey("randomvector")) {
+ o.setRandomVector(Boolean.parseBoolean(taggedArgs.get("randomvector")));
+ so.setRandomVector(Boolean.parseBoolean(taggedArgs.get("randomvector")));
+ }
+
+
if (taggedArgs.containsKey("projection")) {
switch (taggedArgs.get("projection")) {
@@ -552,11 +584,7 @@ public static List runConfigs(List untaggedArgs,
so.setDecoderType(new E8(2f));
break;
}
- case "golay": {
- o.setDecoderType(new Golay());
- so.setDecoderType(new Golay());
- break;
- }
+
case "multie8": {
o.setDecoderType(new MultiDecoder(
o.getInnerDecoderMultiplier() * 8, new E8(2f)));
@@ -576,23 +604,9 @@ public static List runConfigs(List untaggedArgs,
.getInnerDecoderMultiplier() * 24, new Leech()));
break;
}
- case "levypstable": {
- o.setDecoderType(new PsdLSH(PsdLSH.LEVY, o.getDimparameter()));
- so.setDecoderType(new PsdLSH(PsdLSH.LEVY, o.getDimparameter()));
- break;
- }
- case "cauchypstable": {
- o.setDecoderType(new PsdLSH(PsdLSH.CAUCHY, o.getDimparameter()));
- so.setDecoderType(new PsdLSH(PsdLSH.CAUCHY, o.getDimparameter()));
- break;
- }
- case "gaussianpstable": {
- o.setDecoderType(new PsdLSH(PsdLSH.GAUSSIAN, o
- .getDimparameter()));
- so.setDecoderType(new PsdLSH(PsdLSH.GAUSSIAN, o
- .getDimparameter()));
- break;
- }
+
+
+
case "sphere": {// pad to ~32 bits
// int ctsofsphere =
// (int)(Math.log(o.getDimparameter()*2)/Math.log(2.0)) /2;
@@ -659,13 +673,7 @@ public static List runConfigs(List untaggedArgs,
o.setOfflineClusterer(new KMeansPlusPlus());
so.setOfflineClusterer(new KMeansPlusPlus());
break;
- case "adaptivemeanshift": {
-
- o.setOfflineClusterer(new AdaptiveMeanShift());
- so.setOfflineClusterer(new AdaptiveMeanShift());
-
- break;
- }
+
case "kmpp": {
o.setOfflineClusterer(new KMeansPlusPlus());
@@ -727,11 +735,7 @@ public static List runConfigs(List untaggedArgs,
runitems.add(new KMeans2(k, o.getRawData()));
break;
}
- case "pkmeans":
- runitems.add(new LloydIterativeKmeans(k, o.getRawData(), o
- .getNumProjections()));
- break;
-
+
case "kmeansplusplus":
runitems.add(new KMeansPlusPlus(o.getRawData(), k));
break;
@@ -743,14 +747,50 @@ public static List runConfigs(List untaggedArgs,
runitems.add(new StreamingKmeans2(o));
break;
}
- case "adaptivemeanshift": {
- runitems.add(new AdaptiveMeanShift());
- break;
- }
+
case "adaptive": {
runitems.add(new RPHashAdaptive2Pass(o));
break;
}
+
+ case "twrp": {
+ runitems.add(new TWRPv2(o));
+ break;
+ }
+
+ case "twrpmergetree": {
+ runitems.add(new TWRPv3(o));
+ break;
+ }
+
+ case "twrpbisect": {
+ runitems.add(new TWRPv4(o));
+ break;
+ }
+
+ case "twrpbest": {
+ runitems.add(new TWRPv5_WCSS(o));
+ break;
+ }
+
+ case "twrpbest_afterpruning": {
+ runitems.add(new TWRPv6_WCSS2(o));
+ break;
+ }
+
+ case "twrpbest_cov": {
+ runitems.add(new TWRPv6_COV(o));
+ break;
+ }
+
+ case "twrpbest_meanvariance": {
+ runitems.add(new TWRPv6_meanVariance(o));
+ break;
+ }
+
+
+
+
case "dummy": {
runitems.add(new DummyClusterer(so));
break;
diff --git a/src/main/java/edu/uc/rphash/RPHashAdaptive2Pass.java b/src/main/java/edu/uc/rphash/RPHashAdaptive2Pass.java
index 013ea27..927bf7f 100644
--- a/src/main/java/edu/uc/rphash/RPHashAdaptive2Pass.java
+++ b/src/main/java/edu/uc/rphash/RPHashAdaptive2Pass.java
@@ -1,8 +1,10 @@
package edu.uc.rphash;
+import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map.Entry;
@@ -16,11 +18,12 @@
import edu.uc.rphash.tests.StatTests;
import edu.uc.rphash.tests.clusterers.Agglomerative3;
import edu.uc.rphash.tests.generators.GenerateData;
+import edu.uc.rphash.util.VectorUtil;
public class RPHashAdaptive2Pass implements Clusterer, Runnable {
- boolean znorm = true;
+ boolean znorm = false;
private int counter;
@@ -63,17 +66,21 @@ float[] medoid(List X) {
//float[] rngvec; the range vector is moot if incoming data has been normalized
//post normalization it should all be zero centered, with variance 1
-
/*
* super simple hash algorithm, reminiscient of pstable lsh
*/
+ // xt is the projected vector and x is the original vector , rngvec is the randomly generated vector of projected dim.
+
public long hashvec(float[] xt, float[] x,
HashMap> IDAndCent, HashMap> IDAndLabel,int ct) {
- long s = 1;//fixes leading 0's bug
+ long s = 1; //fixes leading 0's bug
for (int i = 0; i < xt.length; i++) {
- s <<= 1;
+// s <<= 1;
+ s = s << 1 ; // left shift the bits of s by 1.
if (xt[i] > rngvec[i])
- s += 1;
+// s += 1;
+ s= s+1;
+
if (IDAndCent.containsKey(s)) {
IDAndLabel.get(s).add(ct);
IDAndCent.get(s).add(x);
@@ -143,16 +150,17 @@ public List> findDensityModes() {
projector.init();
int ct = 0;
- if(znorm == true){
- float[] variance = StatTests.varianceCol(so.getRawData());
- float[] mean = StatTests.meanCols(so.getRawData());
- // #process data by adding to the counter
- for (float[] x : so.getRawData())
- {
- addtocounter(x, projector, IDAndCent,IDAndID,ct++,mean,variance);
- }
- }
- else
+// if(znorm == true){
+// float[] variance = StatTests.varianceCol(so.getRawData());
+// float[] mean = StatTests.meanCols(so.getRawData());
+// // #process data by adding to the counter
+// for (float[] x : so.getRawData())
+// {
+// addtocounter(x, projector, IDAndCent,IDAndID,ct++,mean,variance);
+// }
+// }
+//
+// else
{
for (float[] x : so.getRawData())
@@ -161,6 +169,24 @@ public List> findDensityModes() {
}
}
+
+// for (Long name: IDAndCent.keySet()){
+//
+// String key =name.toString();
+// // String value = IDAndCent.get(name).toString() ;
+// // String value1 = Arrays.toString(value.toString());
+// System.out.println(key ) ;//+ " " + value);
+//}
+
+ System.out.println("NumberOfMicroClustersBeforePruning = "+ IDAndCent.size());
+// for (Long name: IDAndID.keySet()){
+// String key =name.toString();
+// String value = IDAndID.get(name).toString();
+// System.out.println(key + " " + value);
+//
+//
+//}
+
// next we want to prune the tree by parent count comparison
// follows breadthfirst search
HashMap denseSetOfIDandCount = new HashMap();
@@ -197,9 +223,11 @@ public List> findDensityModes() {
List sortedIDList= new ArrayList<>();
// sort and limit the list
- stream.sorted(Entry. comparingByValue().reversed()).limit(so.getk()*4)
+ stream.sorted(Entry. comparingByValue().reversed()).limit(so.getk()*6)
.forEachOrdered(x -> sortedIDList.add(x.getKey()));
+ System.out.println("NumberOfMicroClustersAfterPruning = "+ sortedIDList.size());
+
// compute centroids
HashMap> estcents = new HashMap<>();
@@ -207,12 +235,16 @@ public List> findDensityModes() {
{
estcents.put(sortedIDList.get(i), IDAndCent.get(sortedIDList.get(i)));
}
+
+
// System.out.println();
// for (int i =0; i(estcents.values());
@@ -229,12 +261,14 @@ public void run() {
Listcentroids = new ArrayList<>();
List weights =new ArrayList<>();
- int k = clustermembers.size()>200+so.getk()?200+so.getk():clustermembers.size();
+ // int k = clustermembers.size()>200+so.getk()?200+so.getk():clustermembers.size();
+ int k = clustermembers.size();
for(int i=0;i centroids = null;
- private RPHashObject so;
- int threads = 4;
-
- public RPHashAdaptive2PassParallel(RPHashObject so) {
- this.threads = 4;
- this.so = so;
- }
-
- public RPHashAdaptive2PassParallel(List data, int k, int processors) {
- this.threads = processors;
- so = new SimpleArrayReader(data, k);
- }
-
- public List getCentroids(RPHashObject so) {
- this.so = so;
- return getCentroids();
- }
-
- @Override
- public List getCentroids() {
- if (centroids == null)
- run();
- return centroids;
- }
-
- /*
- * X - set of vectors compute the medoid of a vector set
- */
- float[] medoid(List X) {
- float[] ret = X.get(0);
- for (int i = 1; i < X.size(); i++) {
- for (int j = 0; j < ret.length; j++) {
- ret[j] += X.get(i)[j];
- }
- }
- for (int j = 0; j < ret.length; j++) {
- ret[j] = ret[j] / ((float) X.size());
- }
- return ret;
- }
-
- // float[] rngvec; the range vector is moot if incoming data has been
- // normalized
- // post normalization it should all be zero centered, with variance 1
-
- /*
- * super simple hash algorithm, reminiscient of pstable lsh
- */
- public long hashvec(float[] xt, float[] x,
- Map> IDAndCent,
- Map> IDAndLabel, int ct) {
- long s = 1;// fixes leading 0's bug
- for (int i = 0; i < xt.length; i++) {
- s <<= 1;
- if (xt[i] > rngvec[i])
- s += 1;
- if (IDAndCent.containsKey(s)) {
- if (IDAndLabel.get(s) != null)
- IDAndLabel.get(s).add(ct);
- if (IDAndCent.get(s) != null)
- IDAndCent.get(s).add(x);
- } else {
- ArrayList xlist = new ArrayList<>();
- xlist.add(x);
- IDAndCent.put(s, xlist);
- ArrayList idlist = new ArrayList<>();
- idlist.add(ct);
- IDAndLabel.put(s, idlist);
- }
- }
- return s;
- }
-
- /*
- * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid
- * vector map
- *
- * hash the projected vector x and update the hash to centroid and counts
- * maps
- */
- void addtocounter(float[] x, Projector p,
- Map> IDAndCent,
- Map> IDandID, int ct) {
- float[] xt = p.project(x);
-
- hashvec(xt, x, IDAndCent, IDandID, ct);
- }
-
- /*
- * X - data set k - canonical k in k-means l - clustering sub-space Compute
- * density mode via iterative deepening hash counting
- */
- public Collection> findDensityModes()
- throws InterruptedException, ExecutionException {
-
- // #create projector matrixs
- Projector projector = so.getProjectionType();
- projector.setOrigDim(so.getdim());
- projector.setProjectedDim(so.getDimparameter());
- projector.setRandomSeed(so.getRandomSeed());
- projector.init();
-
- // int ct = 1;
-
- List dat = so.getRawData();
-
- //this counter gets shared
- AtomicInteger ct = new AtomicInteger(0);
-
- ForkJoinPool executor = new ForkJoinPool(this.threads);
-
- int chunksize = dat.size() / this.threads;
-
- //This is the array of essentially thread objets that process in parallel
- ArrayList>>> gather = new ArrayList<>(this.threads);
-
- for (int i = 0; i < this.threads; i++) {
-
- int chunk = chunksize* i;
- gather.add(executor.submit(new Callable