diff --git a/.classpath b/.classpath index 29836c6..90af4e4 100644 --- a/.classpath +++ b/.classpath @@ -1,9 +1,16 @@ - - - - - - - - - + + + + + + + + + + + + + + + + diff --git a/.pydevproject b/.pydevproject new file mode 100644 index 0000000..98ee0df --- /dev/null +++ b/.pydevproject @@ -0,0 +1,5 @@ + + + Default + python interpreter + diff --git a/.settings/org.eclipse.core.resources.prefs b/.settings/org.eclipse.core.resources.prefs new file mode 100644 index 0000000..d1fb81f --- /dev/null +++ b/.settings/org.eclipse.core.resources.prefs @@ -0,0 +1,2 @@ +eclipse.preferences.version=1 +encoding//src/main/java/edu/uc/rphash/tests/plotting.java=UTF-8 diff --git a/scripts/ari_test.py b/scripts/ari_test.py new file mode 100644 index 0000000..4aef401 --- /dev/null +++ b/scripts/ari_test.py @@ -0,0 +1,64 @@ +import pandas as pd +import numpy as np +#from scipy.spatial import distance +from math import dist +import os +import csv +import openpyxl +from sklearn.metrics.cluster import adjusted_rand_score + +# https://github.com/cran/dendextend/blob/master/R/find_k.R +# https://cran.r-project.org/web/packages/fpc/fpc.pdf + +# scipy.spatial.distance.euclidean(A, B) +# dist([1, 0, 0], [0, 1, 0]) + +labels_true_gt=np.genfromtxt("C:/Users/dey.sn/Downloads/temp/haraal/haraal_labels_gt.csv", delimiter=',') +print(labels_true_gt.shape[0]) +print(labels_true_gt) +#column = nArr2D[:, 1] +#output_labels = np.genfromtxt('C:/Users/dey.sn/Downloads/work/output/har_k6/Labels_har_k6_kmpp,cutoff,90,k6.csv', delimiter=',') +''' +output_labels_col1=output_labels[:,0] +print(output_labels.shape[1]) +print(output_labels_col1) +for cols in range(output_labels.shape[1]): + print(adjusted_rand_score(labels_true_gt,output_labels[:,cols])) + +''' +# This is the path where you want to search +path = r'C:/Users/dey.sn/Downloads/work/output/haraal_k6/' +# this is the extension you want to detect +extension = '.csv' +substring="Labels" +count=0 +wb=openpyxl.Workbook() +sheet=wb.active +sheet.title= 
'haraal_ari' +for root, dirs_list, files_list in os.walk(path): + for file_name in files_list: + if os.path.splitext(file_name)[-1] == extension: + file_name_path = os.path.join(root, file_name) + print(file_name) + print(file_name_path) # This is the full path of the filter file + try: + index=file_name.index(substring) + # print(index) + if(index==0): + count+=1 + output_labels = np.genfromtxt(file_name_path, delimiter=',') + b = sheet.cell(row=count, column=2) + b.value = file_name + for cols in range(output_labels.shape[1]): + ari=adjusted_rand_score(labels_true_gt,output_labels[:,cols]) + print(ari) + c = sheet.cell(row=count, column=(cols+12)) + c.value = ari + except ValueError: + print( + "Not found!") + else: + print( + "Found!") +print(count) +wb.save("C:/Users/dey.sn/Downloads/work/output/haraal_k6/results_python_ari_all_runs.xlsx") diff --git a/scripts/knee_test.py b/scripts/knee_test.py new file mode 100644 index 0000000..f5d1b68 --- /dev/null +++ b/scripts/knee_test.py @@ -0,0 +1,350 @@ +import numpy as np +from scipy import interpolate +from scipy.signal import argrelextrema +from sklearn.preprocessing import PolynomialFeatures +from sklearn.linear_model import LinearRegression +import warnings +from typing import Tuple, Optional, Iterable +import matplotlib.pyplot as plt +import pandas as pd + + + + +class KneeLocator(object): + def __init__( + self, + x: Iterable[float], + y: Iterable[float], + S: float = 1.0, + curve: str = "concave", + direction: str = "increasing", + interp_method: str = "interp1d", + online: bool = False, + ): + """ + Once instantiated, this class attempts to find the point of maximum + curvature on a line. The knee is accessible via the `.knee` attribute. + :param x: x values. + :param y: y values. + :param S: Sensitivity, original paper suggests default of 1.0 + :param curve: If 'concave', algorithm will detect knees. If 'convex', it + will detect elbows. 
+ :param direction: one of {"increasing", "decreasing"} + :param interp_method: one of {"interp1d", "polynomial"} + :param online: Will correct old knee points if True, will return first knee if False + """ + # Step 0: Raw Input + self.x = np.array(x) + self.y = np.array(y) + self.curve = curve + self.direction = direction + self.N = len(self.x) + self.S = S + self.all_knees = set() + self.all_norm_knees = set() + self.all_knees_y = [] + self.all_norm_knees_y = [] + self.online = online + + # Step 1: fit a smooth line + if interp_method == "interp1d": + uspline = interpolate.interp1d(self.x, self.y) + self.Ds_y = uspline(self.x) + elif interp_method == "polynomial": + pn_model = PolynomialFeatures(7) + xpn = pn_model.fit_transform(self.x.reshape(-1, 1)) + regr_model = LinearRegression() + regr_model.fit(xpn, self.y) + self.Ds_y = regr_model.predict( + pn_model.fit_transform(self.x.reshape(-1, 1)) + ) + else: + raise ValueError( + "{} is an invalid interp_method parameter, use either 'interp1d' or 'polynomial'".format( + interp_method + ) + ) + + # Step 2: normalize values + self.x_normalized = self.__normalize(self.x) + self.y_normalized = self.__normalize(self.Ds_y) + + # Step 3: Calculate the Difference curve + self.x_normalized, self.y_normalized = self.transform_xy( + self.x_normalized, self.y_normalized, self.direction, self.curve + ) + # normalized difference curve + self.y_difference = self.y_normalized - self.x_normalized + self.x_difference = self.x_normalized.copy() + + # Step 4: Identify local maxima/minima + # local maxima + self.maxima_indices = argrelextrema(self.y_difference, np.greater_equal)[0] + self.x_difference_maxima = self.x_difference[self.maxima_indices] + self.y_difference_maxima = self.y_difference[self.maxima_indices] + + # local minima + self.minima_indices = argrelextrema(self.y_difference, np.less_equal)[0] + self.x_difference_minima = self.x_difference[self.minima_indices] + self.y_difference_minima = 
self.y_difference[self.minima_indices] + + # Step 5: Calculate thresholds + self.Tmx = self.y_difference_maxima - ( + self.S * np.abs(np.diff(self.x_normalized).mean()) + ) + + # Step 6: find knee + self.knee, self.norm_knee = self.find_knee() + + # Step 7: If we have a knee, extract data about it + self.knee_y = self.norm_knee_y = None + if self.knee: + self.knee_y = self.y[self.x == self.knee][0] + self.norm_knee_y = self.y_normalized[self.x_normalized == self.norm_knee][0] + + @staticmethod + def __normalize(a: Iterable[float]) -> Iterable[float]: + """normalize an array + :param a: The array to normalize + """ + return (a - min(a)) / (max(a) - min(a)) + + @staticmethod + def transform_xy( + x: Iterable[float], y: Iterable[float], direction: str, curve: str + ) -> Tuple[Iterable[float], Iterable[float]]: + """transform x and y to concave, increasing based on given direction and curve""" + # convert elbows to knees + if curve == "convex": + x = x.max() - x + y = y.max() - y + # flip decreasing functions to increasing + if direction == "decreasing": + y = np.flip(y, axis=0) + + if curve == "convex": + x = np.flip(x, axis=0) + y = np.flip(y, axis=0) + + return x, y + + def find_knee(self,): + """This function finds and sets the knee value and the normalized knee value. """ + if not self.maxima_indices.size: + warnings.warn( + "No local maxima found in the difference curve\n" + "The line is probably not polynomial, try plotting\n" + "the difference curve with plt.plot(knee.x_difference, knee.y_difference)\n" + "Also check that you aren't mistakenly setting the curve argument", + RuntimeWarning, + ) + return None, None + + # placeholder for which threshold region i is located in. 
+ maxima_threshold_index = 0 + minima_threshold_index = 0 + # traverse the difference curve + for i, x in enumerate(self.x_difference): + # skip points on the curve before the the first local maxima + if i < self.maxima_indices[0]: + continue + + j = i + 1 + + # reached the end of the curve + if x == 1.0: + break + + # if we're at a local max, increment the maxima threshold index and continue + if (self.maxima_indices == i).any(): + threshold = self.Tmx[maxima_threshold_index] + threshold_index = i + maxima_threshold_index += 1 + # values in difference curve are at or after a local minimum + if (self.minima_indices == i).any(): + threshold = 0.0 + minima_threshold_index += 1 + + if self.y_difference[j] < threshold: + if self.curve == "convex": + if self.direction == "decreasing": + knee = self.x[threshold_index] + norm_knee = self.x_normalized[threshold_index] + else: + knee = self.x[-(threshold_index + 1)] + norm_knee = self.x_normalized[-(threshold_index + 1)] + + elif self.curve == "concave": + if self.direction == "decreasing": + knee = self.x[-(threshold_index + 1)] + norm_knee = self.x_normalized[-(threshold_index + 1)] + else: + knee = self.x[threshold_index] + norm_knee = self.x_normalized[threshold_index] + + # add the y value at the knee + y_at_knee = self.y[self.x == knee][0] + y_norm_at_knee = self.y_normalized[self.x_normalized == norm_knee][0] + if knee not in self.all_knees: + self.all_knees_y.append(y_at_knee) + self.all_norm_knees_y.append(y_norm_at_knee) + + # now add the knee + self.all_knees.add(knee) + self.all_norm_knees.add(norm_knee) + + # if detecting in offline mode, return the first knee found + if self.online is False: + return knee, norm_knee + + if self.all_knees == set(): + warnings.warn("No knee/elbow found") + return None, None + + return knee, norm_knee + + def plot_knee_normalized(self, figsize: Optional[Tuple[int, int]] = None): + """Plot the normalized curve, the difference curve (x_difference, y_normalized) and the knee, if it 
exists. + + :param figsize: Optional[Tuple[int, int] + The figure size of the plot. Example (12, 8) + :return: NoReturn + """ + import matplotlib.pyplot as plt + + if figsize is None: + figsize = (6, 6) + + plt.figure(figsize=figsize) + plt.title("Normalized Knee Point") + plt.plot(self.x_normalized, self.y_normalized, "b", label="normalized curve") + plt.plot(self.x_difference, self.y_difference, "r", label="difference curve") + plt.xticks( + np.arange(self.x_normalized.min(), self.x_normalized.max() + 0.1, 0.1) + ) + plt.yticks( + np.arange(self.y_difference.min(), self.y_normalized.max() + 0.1, 0.1) + ) + + plt.vlines( + self.norm_knee, + plt.ylim()[0], + plt.ylim()[1], + linestyles="--", + label="knee/elbow", + ) + plt.legend(loc="best") + + def plot_knee(self, figsize: Optional[Tuple[int, int]] = None): + """ + Plot the curve and the knee, if it exists + + :param figsize: Optional[Tuple[int, int] + The figure size of the plot. Example (12, 8) + :return: NoReturn + """ + import matplotlib.pyplot as plt + + if figsize is None: + figsize = (6, 6) + + plt.figure(figsize=figsize) + plt.title("Knee Point") + plt.plot(self.x, self.y, "b", label="data") + plt.vlines( + self.knee, plt.ylim()[0], plt.ylim()[1], linestyles="--", label="knee/elbow" + ) + plt.legend(loc="best") + + # Niceties for users working with elbows rather than knees + @property + def elbow(self): + return self.knee + + @property + def norm_elbow(self): + return self.norm_knee + + @property + def elbow_y(self): + return self.knee_y + + @property + def norm_elbow_y(self): + return self.norm_knee_y + + @property + def all_elbows(self): + return self.all_knees + + @property + def all_norm_elbows(self): + return self.all_norm_knees + + @property + def all_elbows_y(self): + return self.all_knees_y + + @property + def all_norm_elbows_y(self): + return self.all_norm_knees_y + + +## xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + + +import pandas as pd +import timeit + 
+#df=pd.read_excel("C:/Users/dey.sn/Downloads/work/output/elbow_graph_stage1_syn_data.xlsx") +#df=pd.read_excel("data.xlsx", sheet_name='har2', header=None, na_values=['NA'], usecols="Aq,at",skiprows=range(97),nrows=6) +df=pd.read_excel("C:/Users/dey.sn/Downloads/work/output/elbow_graph_stage1_syn_data.xlsx", sheet_name='N5%_1000', header=None, na_values=['NA'], usecols="A,y",skiprows=range(3),nrows=99) +#print(df) +conv_arr= df.values +start = timeit.default_timer() + +#split matrix into 3 columns each into 1d array +#print(conv_arr.shape) +#print(conv_arr[1,1]) +arr1 = np.delete(conv_arr,1,axis=1) +arr2 = np.delete(conv_arr,0,axis=1) + +#converting into 1D array +x = arr1.ravel() +y = arr2.ravel() + +kn = KneeLocator(list(x), y , S=0.0, curve='convex', direction='decreasing',online=False ) #,interp_method='polynomial') +stop = timeit.default_timer() +print('Time: ', stop - start) +kn2 = KneeLocator(list(x), y , S=1.0, curve='convex', direction='decreasing',online=False ) +print(kn.knee) +print(kn2.knee) +#print(kn.norm_knee) + +plt.style.use('ggplot') +plt.plot() +plt.xlabel('K (no. of clusters) ') +plt.ylabel('WCSSE') +#plt.title('Elbow method for optimal k.[data=HAR, k=4, Pred. k= %d]' %(kn.knee)) +plt.suptitle('Elbow Method For Optimal Cluster Determination [data=Noise_30_percent, K=10, Pred.K = %d]' %(kn.knee),x=0.5, y=0.000, ha="center" , va="bottom") +plt.plot(x, y, 'bx-') +#plt.xscale('log') +plt.grid(True) +plt.xticks() +plt.vlines(kn.knee, plt.ylim()[0], plt.ylim()[1], linestyles='dashed') +plt.savefig("C:/Users/dey.sn/Downloads/work/output/N30%_1000_graph_s0.pdf") +plt.show() + +plt.style.use('ggplot') +plt.plot() +plt.xlabel('Buckets') +plt.ylabel('Counts') +plt.title('Elbow method for optimal k. 
[data=Noise_30_percent, K=10, Pred.K = %d]' %(kn2.knee)) +plt.plot(x, y, 'bx-') +#plt.xscale('log') +plt.grid(True) +plt.xticks() +plt.vlines(kn2.knee, plt.ylim()[0], plt.ylim()[1], linestyles='dashed') +plt.savefig("C:/Users/dey.sn/Downloads/work/output/N30%_1000_graph_s1.pdf") +plt.show() \ No newline at end of file diff --git a/scripts/measures_wcss.py b/scripts/measures_wcss.py new file mode 100644 index 0000000..17d4bdb --- /dev/null +++ b/scripts/measures_wcss.py @@ -0,0 +1,87 @@ +import pandas as pd +import numpy as np +#from scipy.spatial import distance +from math import dist +import os +import csv +import openpyxl +from sklearn.metrics.cluster import adjusted_rand_score + +# https://github.com/cran/dendextend/blob/master/R/find_k.R +# https://cran.r-project.org/web/packages/fpc/fpc.pdf + +# scipy.spatial.distance.euclidean(A, B) +# dist([1, 0, 0], [0, 1, 0]) + +data=np.genfromtxt("C:/Users/dey.sn/Downloads/temp/haraal/2d.csv", delimiter=',') +print(data.shape[0]) +#print(data[10298]) +vectors=data.shape[0] + +# This is the path where you want to search +path = r'C:/Users/dey.sn/Downloads/work/output/haraal_k6/' +# this is the extension you want to detect +extension = '.csv' +substring="haraal_k6" +count=0 +wb=openpyxl.Workbook() +sheet=wb.active +sheet.title= 'haraal' +for root, dirs_list, files_list in os.walk(path): + for file_name in files_list: + if os.path.splitext(file_name)[-1] == extension: + file_name_path = os.path.join(root, file_name) + print(file_name) + print(file_name_path) # This is the full path of the filter file + try: + index=file_name.index(substring) + # print(index) + if(index==0): + count+=1 + centarr = np.genfromtxt(file_name_path, delimiter=',') + b = sheet.cell(row=count, column=2) + b.value = file_name +# centarr = np.genfromtxt('C:/Users/dey.sn/Downloads/work/output/har_k6/har_k6_kmeans_120cutoff _4_2.csv', delimiter=',') +# print(np.shape(centarr)) +# print(centarr[0],centarr[1]) + index = 2 + row=int(centarr[0]) # number of 
centroids + col=int(centarr[1]) + cents=[] + for i in range(row): + c1=[] + for j in range(col): + c1.append(centarr[index]) + index += 1 + cents.append(c1) + +# print(cents[2]) +# print(np.shape(cents)) + + wcss1=0 + for i in range (vectors): + distance1 = [] + for j in range(row): +# print(j) + d1=(dist(data[i], cents[j])) + #print(d1) + distance1.append(d1) + + print(distance1) + mindist=min(distance1) + print(mindist) + + wcss1= int(wcss1 + (mindist*mindist)) + + print("wcss1 is : " , (wcss1)) + + c = sheet.cell(row=count, column=12) + c.value = wcss1 + except ValueError: + print + "Not found!" + else: + print + "Found!" +print(count) +wb.save("C:/Users/dey.sn/Downloads/work/output/haraal_k6/results_python_wcss_all_runs.xlsx") diff --git a/src/main/java/edu/uc/rphash/Centroid.java b/src/main/java/edu/uc/rphash/Centroid.java index 8ffaa6d..20fc768 100644 --- a/src/main/java/edu/uc/rphash/Centroid.java +++ b/src/main/java/edu/uc/rphash/Centroid.java @@ -1,6 +1,7 @@ package edu.uc.rphash; import java.util.ArrayList; +import java.util.List; import java.util.concurrent.ConcurrentSkipListSet; import java.util.stream.Collector; import java.util.stream.Collectors; @@ -233,5 +234,21 @@ public int compareTo(Centroid o) { return (int) (o.id - this.id); } + + + + + public static void removeallobjects(List DB) { + // float[] tmp; + for (int i = 0; i < DB.size(); i++) { + //tmp = DB.get(i).centroid(); + DB.remove(i); + + } + + + } + + } diff --git a/src/main/java/edu/uc/rphash/Dis_PPAHStream.java b/src/main/java/edu/uc/rphash/Dis_PPAHStream.java new file mode 100644 index 0000000..7630571 --- /dev/null +++ b/src/main/java/edu/uc/rphash/Dis_PPAHStream.java @@ -0,0 +1,371 @@ +package edu.uc.rphash; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.concurrent.Callable; +import java.util.concurrent.ConcurrentSkipListSet; 
+import java.util.concurrent.CopyOnWriteArrayList; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.ForkJoinPool; +import java.util.concurrent.Future; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.decoders.Decoder; +import edu.uc.rphash.decoders.Spherical; +import edu.uc.rphash.frequentItemSet.ItemSet; +import edu.uc.rphash.frequentItemSet.SimpleFrequentItemSet; +import edu.uc.rphash.lsh.LSH; +//import edu.uc.rphash.projections.DBFriendlyProjection; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.standardhash.HashAlgorithm; +import edu.uc.rphash.standardhash.NoHash; +import edu.uc.rphash.tests.clusterers.KMeans2; +import edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.VectorUtil; + +public class Dis_PPAHStream implements Clusterer { + // float variance; + + public ItemSet is; + + List labels; + HashMap labelmap; + + private int processors = 1; + + public static long mapfunc(float[] vec, LSH lshfunc) { + + return lshfunc.lshHash(vec); + + } + + public RPHashObject mapreduce1() { + + //------------This is Setup Code------------- + // create our LSH Machine + HashAlgorithm hal = new NoHash(so.getHashmod()); + Iterator vecs = so.getVectorIterator(); + if (!vecs.hasNext()) + return so; + + int logk = (int) (.5 + Math.log(so.getk()) / Math.log(2));// log k and + // round to + // integer + int k = so.getk() * logk; + is = new SimpleFrequentItemSet(k); + Decoder dec = so.getDecoderType(); + dec.setCounter(is); + + Projector p = so.getProjectionType(); + p.setOrigDim(so.getdim()); + p.setProjectedDim(dec.getDimensionality()); + p.setRandomSeed(so.getRandomSeed()); + p.init(); + // no noise to start with + List noise = LSH.genNoiseTable( + dec.getDimensionality(), 
+ so.getNumBlur(), + new Random(), + dec.getErrorRadius() + / (dec.getDimensionality() * dec.getDimensionality())); + + LSH lshfunc = new LSH(dec, p, hal, noise, so.getNormalize()); + + // add to frequent itemset the hashed Decoded randomly projected vector + + + List dat = so.getRawData(); + + //Dey + //------------------------- + //------------This is the actual map function------------- + + //this is the actual map + ForkJoinPool forkJoinPool = new ForkJoinPool(this.processors ); + try { + forkJoinPool.submit(() -> + dat.parallelStream().map(s->mapfunc(s,lshfunc)).forEach(s->is.add(s)) + ).get(); + } catch (ExecutionException|InterruptedException e) { + e.printStackTrace(); + } + forkJoinPool.shutdown(); + + //------------------------- + + + //------------This is clean up code------------- + List topids = is.getTop(); + so.setPreviousTopID(topids); + + List topsizes = is.getCounts(); + + + // this is where the parallel reduce function would be + // to sum up the counts that correspond to hash_ids + // so very much the word count example + List countsAsFloats = new ArrayList(); + for (long ct : topsizes) + countsAsFloats.add((float) ct); + so.setCounts(countsAsFloats); + return so; + } + + public static void redFunc(float[] vec, LSH lshfunc, List noise, + List labels, List centroids) { + long[] hash = lshfunc.lshHashRadius(vec, noise); + labels.add(-1l); + // radius probe around the vector + for (Centroid cent : centroids) { + for (long h : hash) { + if (cent.ids.contains(h)) { + cent.updateVec(vec); + labels.set(labels.size() - 1, cent.id); + } + } + } + } + + public static long[] redFunc(float[] vec, LSH lshfunc, List noise) { + return lshfunc.lshHashRadius(vec, noise); + } + + /* + * This is the second phase after the top ids have been in the reduce phase + * aggregated + */ + public RPHashObject mapreduce2() { + + //------------This is Setup Code------------- + Iterator vecs = so.getVectorIterator(); + if (!vecs.hasNext()) + return so; + float[] vec = 
vecs.next(); + + HashAlgorithm hal = new NoHash(so.getHashmod()); + Decoder dec = so.getDecoderType(); + + Projector p = so.getProjectionType(); + p.setOrigDim(so.getdim()); + p.setProjectedDim(dec.getDimensionality()); + p.setRandomSeed(so.getRandomSeed()); + p.init(); + + List noise = LSH.genNoiseTable( + so.getdim(), + so.getNumBlur(), + new Random(so.getRandomSeed()), + (float) (dec.getErrorRadius()) + / (float) (dec.getDimensionality() * dec + .getDimensionality())); + + LSH lshfunc = new LSH(dec, p, hal, noise, so.getNormalize()); + ArrayList centroids = new ArrayList(); + + for (long id : so.getPreviousTopID()) { + centroids.add(new Centroid(so.getdim(), id, -1)); + } + + //DEY + //------------------------------------------------- + //------------This is the parallel map------------- + + List dat = so.getRawData(); + + + ForkJoinPool forkJoinPool = new ForkJoinPool(this.processors ); + try { + //parallel map + forkJoinPool.submit(() -> + dat.parallelStream().map(s->redFunc(s,lshfunc,noise)).forEach(hashes -> { + //end parallel map + + //parallel reduce + //local centroids is what would need to be implemented + // to update in parallel in each node + // currently this thing shares the centroids list, which is a bottleneck + // the reducer would need to use this to reduce centroids with the same id + // Centroid.merge(ctcent1, cent1,wcsscent1,ctcent2, cent2,wcsscent2); +// List localcentroids = centroids.stream().map(Centroid::new).collect(Centroid.toArrayList()); + for (Centroid cent : centroids) { + for (long h : hashes) + { + if (cent.ids.contains(h)) + { + cent.updateVec(vec); + } + } + } + })).get(); + } catch (InterruptedException|ExecutionException e) { + e.printStackTrace(); + } + + forkJoinPool.shutdown(); + //------------------------------------------------- + + //------------This is the cleanup code------------- + //Sequential + + Clusterer offlineclusterer = new KMeans2();//so.getOfflineClusterer(); + 
offlineclusterer.setData(centroids.stream().collect(Centroid.toArrayList())); + offlineclusterer.setWeights(so.getCounts()); + offlineclusterer.setK(so.getk()); + +// this.labelmap = VectorUtil.generateIDMap(centroids, this.centroids); + so.setCentroids(offlineclusterer.getCentroids()); + return so; + } + + // 271458 + // 264779.7 + + public List getLabels() { + for (int i = 0; i < labels.size(); i++) { + if (labelmap.containsKey(labels.get(i))) { + labels.set(i, labelmap.get(labels.get(i))); + } else { + labels.set(i, -1l); + } + } + return this.labels; + } + + private List centroids = null; + private RPHashObject so; + + public Dis_PPAHStream(List data, int k) { + so = new SimpleArrayReader(data, k); + } + +// int threads = 1; + + public Dis_PPAHStream(List data, int k, int processors) { + + this.processors = processors; + so = new SimpleArrayReader(data, k); + so.setParallel(true); + } + + public Dis_PPAHStream(List data, int k, int times, int rseed) { + so = new SimpleArrayReader(data, k); + } + + public Dis_PPAHStream(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + if (centroids == null) + run(); + return centroids; + } + + @Override + public List getCentroids() { + if (centroids == null) + run(); + + return centroids; + } + + private void run() { + mapreduce1(); + mapreduce2(); + //this.centroids = so.getCentroids(); + } + + public static void main(String[] args) { + int k = 10; + int d = 1000; + int n = 10000; + float var = 1f; + int count = 5; + System.out.printf("Decoder: %s\n", "Sphere"); + System.out.printf("ClusterVar\t"); + for (int i = 0; i < count; i++) + System.out.printf("Trial%d\t", i); + System.out.printf("RealWCSS\n"); + + for (float f = var; f < 3.01; f += .05f) { + float avgrealwcss = 0; + float avgtime = 0; + System.out.printf("%f\t", f); + for (int i = 0; i < count; i++) { + GenerateData gen = new GenerateData(k, n / k, d, f, true, 1f); + RPHashObject o = new 
SimpleArrayReader(gen.data, k); + Dis_PPAHStream rphit = new Dis_PPAHStream(o); +// rphit.threads = 4; + o.setDecoderType(new Spherical(32, 4, 1)); + // o.setDimparameter(31); + o.setOfflineClusterer(new KMeans2()); + long startTime = System.nanoTime(); + List centsr = rphit.getCentroids(); + avgtime += (System.nanoTime() - startTime) / 100000000; + + // avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(), + // gen.getData()); + + // System.out.printf("%.0f\t", + // StatTests.WCSSECentroidsFloat(centsr, gen.data)); + // System.gc(); + + } + System.out.printf("%.0f\n", avgrealwcss / count); + } + } + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + // TODO Auto-generated method stub + + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + return true; + } +} diff --git a/src/main/java/edu/uc/rphash/Dis_PRPHashStream.java b/src/main/java/edu/uc/rphash/Dis_PRPHashStream.java new file mode 100644 index 0000000..75614a0 --- /dev/null +++ b/src/main/java/edu/uc/rphash/Dis_PRPHashStream.java @@ -0,0 +1,371 @@ +package edu.uc.rphash; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.concurrent.Callable; +import java.util.concurrent.ConcurrentSkipListSet; +import java.util.concurrent.CopyOnWriteArrayList; +import 
java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.ForkJoinPool; +import java.util.concurrent.Future; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.decoders.Decoder; +import edu.uc.rphash.decoders.Spherical; +import edu.uc.rphash.frequentItemSet.ItemSet; +import edu.uc.rphash.frequentItemSet.SimpleFrequentItemSet; +import edu.uc.rphash.lsh.LSH; +//import edu.uc.rphash.projections.DBFriendlyProjection; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.standardhash.HashAlgorithm; +import edu.uc.rphash.standardhash.NoHash; +import edu.uc.rphash.tests.clusterers.KMeans2; +import edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.VectorUtil; + +public class Dis_PRPHashStream implements Clusterer { + // float variance; + + public ItemSet is; + + List labels; + HashMap labelmap; + + private int processors = 1; + + public static long mapfunc(float[] vec, LSH lshfunc) { + + return lshfunc.lshHash(vec); + + } + + public RPHashObject mapreduce1() { + + //------------This is Setup Code------------- + // create our LSH Machine + HashAlgorithm hal = new NoHash(so.getHashmod()); + Iterator vecs = so.getVectorIterator(); + if (!vecs.hasNext()) + return so; + + int logk = (int) (.5 + Math.log(so.getk()) / Math.log(2));// log k and + // round to + // integer + int k = so.getk() * logk; + is = new SimpleFrequentItemSet(k); + Decoder dec = so.getDecoderType(); + dec.setCounter(is); + + Projector p = so.getProjectionType(); + p.setOrigDim(so.getdim()); + p.setProjectedDim(dec.getDimensionality()); + p.setRandomSeed(so.getRandomSeed()); + p.init(); + // no noise to start with + List noise = LSH.genNoiseTable( + dec.getDimensionality(), + so.getNumBlur(), + new Random(), + 
dec.getErrorRadius() + / (dec.getDimensionality() * dec.getDimensionality())); + + LSH lshfunc = new LSH(dec, p, hal, noise, so.getNormalize()); + + // add to frequent itemset the hashed Decoded randomly projected vector + + + List dat = so.getRawData(); + + //Dey + //------------------------- + //------------This is the actual map function------------- + + //this is the actual map + ForkJoinPool forkJoinPool = new ForkJoinPool(this.processors ); + try { + forkJoinPool.submit(() -> + dat.parallelStream().map(s->mapfunc(s,lshfunc)).forEach(s->is.add(s)) + ).get(); + } catch (ExecutionException|InterruptedException e) { + e.printStackTrace(); + } + forkJoinPool.shutdown(); + + //------------------------- + + + //------------This is clean up code------------- + List topids = is.getTop(); + so.setPreviousTopID(topids); + + List topsizes = is.getCounts(); + + + // this is where the parallel reduce function would be + // to sum up the counts that correspond to hash_ids + // so very much the word count example + List countsAsFloats = new ArrayList(); + for (long ct : topsizes) + countsAsFloats.add((float) ct); + so.setCounts(countsAsFloats); + return so; + } + + public static void redFunc(float[] vec, LSH lshfunc, List noise, + List labels, List centroids) { + long[] hash = lshfunc.lshHashRadius(vec, noise); + labels.add(-1l); + // radius probe around the vector + for (Centroid cent : centroids) { + for (long h : hash) { + if (cent.ids.contains(h)) { + cent.updateVec(vec); + labels.set(labels.size() - 1, cent.id); + } + } + } + } + + public static long[] redFunc(float[] vec, LSH lshfunc, List noise) { + return lshfunc.lshHashRadius(vec, noise); + } + + /* + * This is the second phase after the top ids have been in the reduce phase + * aggregated + */ + public RPHashObject mapreduce2() { + + //------------This is Setup Code------------- + Iterator vecs = so.getVectorIterator(); + if (!vecs.hasNext()) + return so; + float[] vec = vecs.next(); + + HashAlgorithm hal = new 
NoHash(so.getHashmod()); + Decoder dec = so.getDecoderType(); + + Projector p = so.getProjectionType(); + p.setOrigDim(so.getdim()); + p.setProjectedDim(dec.getDimensionality()); + p.setRandomSeed(so.getRandomSeed()); + p.init(); + + List noise = LSH.genNoiseTable( + so.getdim(), + so.getNumBlur(), + new Random(so.getRandomSeed()), + (float) (dec.getErrorRadius()) + / (float) (dec.getDimensionality() * dec + .getDimensionality())); + + LSH lshfunc = new LSH(dec, p, hal, noise, so.getNormalize()); + ArrayList centroids = new ArrayList(); + + for (long id : so.getPreviousTopID()) { + centroids.add(new Centroid(so.getdim(), id, -1)); + } + + //DEY + //------------------------------------------------- + //------------This is the parallel map------------- + + List dat = so.getRawData(); + + + ForkJoinPool forkJoinPool = new ForkJoinPool(this.processors ); + try { + //parallel map + forkJoinPool.submit(() -> + dat.parallelStream().map(s->redFunc(s,lshfunc,noise)).forEach(hashes -> { + //end parallel map + + //parallel reduce + //local centroids is what would need to be implemented + // to update in parallel in each node + // currently this thing shares the centroids list, which is a bottleneck + // the reducer would need to use this to reduce centroids with the same id + // Centroid.merge(ctcent1, cent1,wcsscent1,ctcent2, cent2,wcsscent2); +// List localcentroids = centroids.stream().map(Centroid::new).collect(Centroid.toArrayList()); + for (Centroid cent : centroids) { + for (long h : hashes) + { + if (cent.ids.contains(h)) + { + cent.updateVec(vec); + } + } + } + })).get(); + } catch (InterruptedException|ExecutionException e) { + e.printStackTrace(); + } + + forkJoinPool.shutdown(); + //------------------------------------------------- + + //------------This is the cleanup code------------- + //Sequential + + Clusterer offlineclusterer = new KMeans2();//so.getOfflineClusterer(); + offlineclusterer.setData(centroids.stream().collect(Centroid.toArrayList())); + 
offlineclusterer.setWeights(so.getCounts()); + offlineclusterer.setK(so.getk()); + +// this.labelmap = VectorUtil.generateIDMap(centroids, this.centroids); + so.setCentroids(offlineclusterer.getCentroids()); + return so; + } + + // 271458 + // 264779.7 + + public List getLabels() { + for (int i = 0; i < labels.size(); i++) { + if (labelmap.containsKey(labels.get(i))) { + labels.set(i, labelmap.get(labels.get(i))); + } else { + labels.set(i, -1l); + } + } + return this.labels; + } + + private List centroids = null; + private RPHashObject so; + + public Dis_PRPHashStream(List data, int k) { + so = new SimpleArrayReader(data, k); + } + +// int threads = 1; + + public Dis_PRPHashStream(List data, int k, int processors) { + + this.processors = processors; + so = new SimpleArrayReader(data, k); + so.setParallel(true); + } + + public Dis_PRPHashStream(List data, int k, int times, int rseed) { + so = new SimpleArrayReader(data, k); + } + + public Dis_PRPHashStream(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + if (centroids == null) + run(); + return centroids; + } + + @Override + public List getCentroids() { + if (centroids == null) + run(); + + return centroids; + } + + private void run() { + mapreduce1(); + mapreduce2(); + //this.centroids = so.getCentroids(); + } + + public static void main(String[] args) { + int k = 10; + int d = 1000; + int n = 10000; + float var = 1f; + int count = 5; + System.out.printf("Decoder: %s\n", "Sphere"); + System.out.printf("ClusterVar\t"); + for (int i = 0; i < count; i++) + System.out.printf("Trial%d\t", i); + System.out.printf("RealWCSS\n"); + + for (float f = var; f < 3.01; f += .05f) { + float avgrealwcss = 0; + float avgtime = 0; + System.out.printf("%f\t", f); + for (int i = 0; i < count; i++) { + GenerateData gen = new GenerateData(k, n / k, d, f, true, 1f); + RPHashObject o = new SimpleArrayReader(gen.data, k); + Dis_PRPHashStream rphit = new Dis_PRPHashStream(o); +// 
rphit.threads = 4; + o.setDecoderType(new Spherical(32, 4, 1)); + // o.setDimparameter(31); + o.setOfflineClusterer(new KMeans2()); + long startTime = System.nanoTime(); + List centsr = rphit.getCentroids(); + avgtime += (System.nanoTime() - startTime) / 100000000; + + // avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(), + // gen.getData()); + + // System.out.printf("%.0f\t", + // StatTests.WCSSECentroidsFloat(centsr, gen.data)); + // System.gc(); + + } + System.out.printf("%.0f\n", avgrealwcss / count); + } + } + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + // TODO Auto-generated method stub + + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + return true; + } +} diff --git a/src/main/java/edu/uc/rphash/PPAHStream.java b/src/main/java/edu/uc/rphash/PPAHStream.java new file mode 100644 index 0000000..3970bc1 --- /dev/null +++ b/src/main/java/edu/uc/rphash/PPAHStream.java @@ -0,0 +1,810 @@ +package edu.uc.rphash; + +// This class will run the Parameter-free Projected Adaptive Hash Stream Clustering + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Comparator; +//import java.util.Arrays; +import java.util.HashMap; +//import java.util.Iterator; +//import java.util.LinkedHashMap; +import java.util.List; +//import java.util.Map; +import java.util.Map.Entry; +import java.util.Random; 
+import java.util.TreeSet; +import java.util.stream.Stream; +import java.util.Collections; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.kneefinder.JythonTest; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.clusterers.Agglomerative3; +import edu.uc.rphash.tests.clusterers.KMeans2; +import edu.uc.rphash.tests.clusterers.Agglomerative3.ClusteringType; +import edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.VectorUtil; +import edu.uc.rphash.tests.clusterers.DBScan; +import edu.uc.rphash.tests.clusterers.MultiKMPP; + + +//import org.apache.commons.collections.map.MultiValueMap; +//import org.apache.commons.collections.map.*; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Multimap; + + +// https://www.javatips.net/api/webofneeds-master/webofneeds/won-matcher-solr/src/main/java/won/matcher/solr/utils/Kneedle.java +// https://github.com/lukehb/137-stopmove/blob/master/src/main/java/onethreeseven/stopmove/algorithm/Kneedle.java + +// this algorithm runs twrp 3 times : (only the random bisection vector varies, the Projection matrix remains same) +// and selects the one which has the best wcss offline for the 10X candidate centroids. 
+ + +public class PPAHStream implements Clusterer, Runnable { + + + List labels; // to directly output labels + HashMap labelmap; // to directly output labels + public List getLabels() { + for (int i = 0; i < labels.size(); i++) { + if (labelmap.containsKey(labels.get(i))) { + labels.set(i, labelmap.get(labels.get(i))); + } else { + labels.set(i, -1l); + } + } + return this.labels; + } + + + + boolean znorm = false; + private int counter; + private float[] rngvec; + private float[] rngvec2; + private float[] rngvec3; + private float eps; + + private List centroids = null; + + private RPHashObject so; + + public PPAHStream(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + return getCentroids(); + } + + @Override + public List getCentroids() { + if (centroids == null) + run(); + return centroids; + } + + +// This function returns the square of the euclidean distance. + public static float distancesq(float[] x, float[] y) { + if (x.length < 1) + return Float.MAX_VALUE; + if (y.length < 1) + return Float.MAX_VALUE; + float dist = (x[0] - y[0]) * (x[0] - y[0]); + for (int i = 1; i < x.length; i++) + dist += ((x[i] - y[i]) * (x[i] - y[i])); +// return (float) Math.sqrt(dist); + return dist; + } + + + /* + * X - set of vectors compute the medoid of a vector set + */ + float[] medoid(List X) { + float[] ret = X.get(0); + for (int i = 1; i < X.size(); i++) { + for (int j = 0; j < ret.length; j++) { + ret[j] += X.get(i)[j]; + } + } + for (int j = 0; j < ret.length; j++) { + ret[j] = ret[j] / ((float) X.size()); + } + return ret; + } + +// this updates the map two cents with different weights are merged into one. 
+ public static float[][] UpdateHashMap(float cnt_1, float[] x_1, float wcss_1, + float cnt_2, float[] x_2 , float wcss_2) { + + float cnt_r = cnt_1 + cnt_2; + + float[] x_r = new float[x_1.length]; + + for (int i = 0; i < x_1.length; i++) { + x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; + + } +// float wcss = (distancesq(x_r,x_2)/cnt_r) + wcss_1; +// float wcss = ( ( cnt_1*(wcss_1 + distancesq(x_r,x_1)) ) + distancesq(x_r,x_2) ) / (cnt_1); + + float wcss = ( ((wcss_1 + distancesq(x_r,x_1)) ) + distancesq(x_r,x_2) ); + +// float wcss = ( ( ( cnt_1*(wcss_1 + distancesq(x_r,x_1)) ) + distancesq(x_r,x_2) ) / (cnt_r) ); +// System.out.println("wcsse = " + wcss); + + float[][] ret = new float[3][]; + ret[0] = new float[1]; + ret[0][0] = cnt_r; + ret[1] = x_r; + ret[2]= new float [1]; + ret[2][0]= wcss; + return ret; + + } + + + public long hashvec2( float[] xt, float[] x, + HashMap MapOfIDAndCent, HashMap MapOfIDAndCount, int ct, float[] rngvec, HashMap MapOfIDAndWCSS) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { + s = s << 1 ; // left shift the bits of s by 1. 
+ if (xt[i] > rngvec[i]) + s= s+1; + + if (MapOfIDAndCent.containsKey(s)) { + + float CurrentCount = MapOfIDAndCount.get(s); + float CurrentCent [] = MapOfIDAndCent.get(s); + float CountForIncomingVector = 1; + float IncomingVector [] = x; + float currentWcss= MapOfIDAndWCSS.get(s); + float incomingWcss= 0; + + float[][] MergedValues = UpdateHashMap(CurrentCount , CurrentCent, currentWcss, CountForIncomingVector, IncomingVector, incomingWcss ); + + Long UpdatedCount = (long) MergedValues[0][0] ; + + float[] MergedVector = MergedValues[1] ; + + float wcss= MergedValues[2][0]; + + MapOfIDAndCount.put(s , UpdatedCount); + + MapOfIDAndCent.put(s, MergedVector); + + MapOfIDAndWCSS.put(s, wcss); + + } + + else { + + float[] xlist = x; + MapOfIDAndCent.put(s, xlist); + MapOfIDAndCount.put(s, (long)1); + MapOfIDAndWCSS.put(s, (float)0); + } + } + return s; + } + + + + /* + * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid + * vector map + * + * hash the projected vector x and update the hash to centroid and counts + * maps + */ + void addtocounter(float[] x, Projector p, + HashMap IDAndCent,HashMap IDandID,int ct, float[] rngvec , HashMap IDandWCSS) { + float[] xt = p.project(x); + + hashvec2(xt,x,IDAndCent, IDandID, ct,rngvec , IDandWCSS); + } + + + static boolean isPowerOfTwo(long num) { + return (num & -num) == num; + } + + + public void printHashmap(HashMap hashmap) { + + System.out.println(hashmap.keySet()); + System.out.println(hashmap.values()); + + } +public void printStream(Stream> stream) { + + //System.out.println(hashmap.keySet()); + System.out.println(stream.count()); + +} +// this method calculates the epsilon value and prints the information. 
+public float printInfo(ListsetofKeys, HashMap MapOfIDAndCount, HashMap MapOfIDAndCent, HashMap MapOfIDAndWCSS) { + + List counts = new ArrayList<>(); +// List wcsseprint = new ArrayList<>(); + List wcsseprint = new ArrayList<>(); +// float temp = 0; + int elements=0; + float avg=0; + + for (Long keys: setofKeys) + { + elements=elements+1; +//// System.out.println(MapOfIDAndCount.get(keys)); + counts.add(MapOfIDAndCount.get(keys)); + wcsseprint.add(MapOfIDAndWCSS.get(keys).longValue()); + + } +// System.out.println(); + System.out.print(counts); + + + +// for (Long keys: setofKeys) +// { +// System.out.println(MapOfIDAndWCSS.get(keys)); +// wcsseprint.add(MapOfIDAndWCSS.get(keys)); +// } + + // calculation of epsilon + /* + for (int i=0 ; i<(0.8*elements); i++) //for (int i=0 ; i<(0.8*elements); i++) + { + temp = temp + (wcsseprint.get(i))/(counts.get(i)); + } + avg = (float) (temp/(0.8*elements)); + System.out.println(); + System.out.println("\taverage epsilon = "+ avg); + */ + Collections.sort(wcsseprint); + Collections.reverse(wcsseprint); + System.out.println(); + System.out.println(wcsseprint); + System.out.println(); + + JythonTest elbowcalculator = new JythonTest(); + int num_of_clusters= elbowcalculator.find_elbow(wcsseprint); + //int num_of_clusters_2= elbowcalculator.find_elbow(counts); + System.out.println("\n" + "No. of clusters_by_WCSS = " + num_of_clusters); + //System.out.println( "No. 
of clusters_by_COUNT = " + num_of_clusters_2); + System.out.println( "************************************************************" ); + + + return (avg); + } + + /* + * X - data set k - canonical k in k-means l - clustering sub-space Compute + * density mode via iterative deepening hash counting + */ + + public Multimap findDensityModes2() { + //public Map findDensityModes2() { + HashMap MapOfIDAndCent1 = new HashMap<>(); + HashMap MapOfIDAndCount1 = new HashMap<>(); + HashMap MapOfIDAndWCSS1 = new HashMap<>(); + + HashMap MapOfIDAndCent2 = new HashMap<>(); + HashMap MapOfIDAndCount2 = new HashMap<>(); + HashMap MapOfIDAndWCSS2 = new HashMap<>(); + + HashMap MapOfIDAndCent3 = new HashMap<>(); + HashMap MapOfIDAndCount3 = new HashMap<>(); + HashMap MapOfIDAndWCSS3 = new HashMap<>(); + + + // #create projector matrixs + Projector projector = so.getProjectionType(); + projector.setOrigDim(so.getdim()); + projector.setProjectedDim(so.getDimparameter()); + + projector.setRandomSeed(so.getRandomSeed()); + //projector.setRandomSeed(949124732); + + projector.init(); + int cutoff = so.getCutoff(); + + int ct = 0; + int ct2 = 0; + int ct3 =0; + + { + + for (float[] x : so.getRawData()) + { + addtocounter(x, projector, MapOfIDAndCent1, MapOfIDAndCount1,ct++, rngvec, MapOfIDAndWCSS1); + addtocounter(x, projector, MapOfIDAndCent2, MapOfIDAndCount2,ct2++, rngvec2,MapOfIDAndWCSS2); + addtocounter(x, projector, MapOfIDAndCent3, MapOfIDAndCount3,ct3++, rngvec3,MapOfIDAndWCSS3); + + } + } + + System.out.println("\nNumberOfVectors = , "+ ct); + System.out.println("\nNumberOfMicroClustersBeforePruning = , "+ MapOfIDAndCent1.size()); + //printHashmap(MapOfIDAndCount1); + + // next we want to prune the tree by parent count comparison + // follows breadthfirst search + + HashMap denseSetOfIDandCount2_1 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount1.keySet())) + { + //if (cur_id >so.getk()){ + if (cur_id > Long.valueOf(3)){ + int cur_count = (int) 
(MapOfIDAndCount1.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount1.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_1.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + MapOfIDAndCent1.put(parent_id, new float[]{}); + + // MapOfIDAndCount1.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_1.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent1.put(parent_id, new float[]{}); + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + } + } + } + } + } + + HashMap denseSetOfIDandCount2_2 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount2.keySet())) + { + + //if (cur_id >so.getk()){ + if (cur_id > Long.valueOf(7)){ + int cur_count = (int) (MapOfIDAndCount2.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount2.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_2.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + MapOfIDAndCent2.put(parent_id, new float[]{}); + + // MapOfIDAndCount2.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_2.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent2.put(parent_id, new float[]{}); + // MapOfIDAndCount2.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_3 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount3.keySet())) + { + //if (cur_id >so.getk()){ + if 
(cur_id > Long.valueOf(11)){ + int cur_count = (int) (MapOfIDAndCount3.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount3.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_3.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + MapOfIDAndCent3.put(parent_id, new float[]{}); + + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_3.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent3.put(parent_id, new float[]{}); + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + } + } + } + } + } + + + //remove keys with support less than 1 + Stream> stream2_1 = denseSetOfIDandCount2_1.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_1= new ArrayList<>(); + // sort and limit the list + stream2_1.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_1.add(x.getKey())); + // printHashmap(denseSetOfIDandCount2_1); + + + Stream> stream2_2 = denseSetOfIDandCount2_2.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_2= new ArrayList<>(); + // sort and limit the list + stream2_2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_2.add(x.getKey())); + //printHashmap(denseSetOfIDandCount2_2); + + + Stream> stream2_3 = denseSetOfIDandCount2_3.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_3= new ArrayList<>(); + // sort and limit the list + stream2_3.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_3.add(x.getKey())); + //printHashmap(denseSetOfIDandCount2_3); + + float WCSS1 = 0; + float WCSS2 = 0; + float WCSS3 = 0; + + HashMap denseSetOfIDandCount2 = new HashMap(); + + HashMap MapOfIDAndCent = new HashMap<>(); + HashMap MapOfIDAndCount = new HashMap<>(); + HashMap MapOfIDAndWCSS = new HashMap<>(); + + + + + //* THIS IS THE RUNTIME CALCULATION OF WCSS STATISTICS WHICH IS DONE ONLINE: + + for (Long keys: sortedIDList2_1) +// for (Long cur_id : (((HashMap) stream2_1).keySet())) + { // System.out.println("wcss1 = " + MapOfIDAndWCSS1.get(cur_id)); + WCSS1 = WCSS1 + MapOfIDAndWCSS1.get(keys);} + +// for (Long cur_id : (denseSetOfIDandCount2_2.keySet())) + for (Long keys: sortedIDList2_2) + { WCSS2 = WCSS2 + MapOfIDAndWCSS2.get(keys);} + +// for (Long cur_id : (denseSetOfIDandCount2_3.keySet())) + for (Long keys: sortedIDList2_3) + { WCSS3 = WCSS3 + MapOfIDAndWCSS3.get(keys);} + + + System.out.println("wcss1(online calc) of candidate cents = , " + WCSS1); +// System.out.println(" wcss_ofline_calc_1 = " + WCSS_off_1); + + System.out.println("wcss1(online calc) of candidate cents = , " + WCSS2); +// System.out.println(" wcss_ofline_calc_2 = " + WCSS_off_2); + + System.out.println("wcss1(online calc) of candidate cents = , " + WCSS3); +// System.out.println(" wcss_ofline_calc_3 = " + WCSS_off_3); + + if ((WCSS1 <= WCSS2) && (WCSS1 <= WCSS3)) + {MapOfIDAndCount = MapOfIDAndCount1; + MapOfIDAndCent = MapOfIDAndCent1; + MapOfIDAndWCSS = MapOfIDAndWCSS1; + denseSetOfIDandCount2 = denseSetOfIDandCount2_1; + System.out.println("winner = tree1"); + } + else if ((WCSS2<= WCSS1) && (WCSS2<=WCSS3)) + {MapOfIDAndCount = MapOfIDAndCount2; + MapOfIDAndCent = MapOfIDAndCent2; + MapOfIDAndWCSS = MapOfIDAndWCSS2; + denseSetOfIDandCount2 = denseSetOfIDandCount2_2; + System.out.println("winner = tree2"); + } + else + {MapOfIDAndCount = MapOfIDAndCount3; + MapOfIDAndCent = MapOfIDAndCent3; + 
MapOfIDAndWCSS = MapOfIDAndWCSS3; + denseSetOfIDandCount2 = denseSetOfIDandCount2_3; + System.out.println("winner = tree3"); + + } + + System.out.println("NumberOfMicroClusters_AfterPruning_&_beforesortingLimit = , "+ denseSetOfIDandCount2.size()); + + //remove keys with support less than 1 + Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 2); + + List sortedIDList2= new ArrayList<>(); + // sort and limit the list + stream2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2.add(x.getKey())); + + System.out.println("------------------------------------------------------------------------------------------------------------------"); + //printHashmap(denseSetOfIDandCount2); + float eps= printInfo(sortedIDList2,denseSetOfIDandCount2, MapOfIDAndCent,MapOfIDAndWCSS); +// seteps(eps); + + + + Multimap multimapWeightAndCent = ArrayListMultimap.create(); + + for (Long keys: sortedIDList2) + + { + + multimapWeightAndCent.put((Long)(MapOfIDAndCount.get(keys)), (float[]) (MapOfIDAndCent.get(keys))); + + } + + + return multimapWeightAndCent; + +} + + public void run() { + rngvec = new float[so.getDimparameter()]; + + rngvec2 = new float[so.getDimparameter()]; + + rngvec3 = new float[so.getDimparameter()]; + + counter = 0; + boolean randVect = so.getRandomVector(); + + // Random r = new Random(so.getRandomSeed()); + // Random r = new Random(3800635955020675334L) ; + + Random r = new Random(); + //Random r = new Random(923063597592675214L) ; + Random r2 = new Random(); + //Random r2 = new Random(923063597592675214L) ; + Random r3 = new Random(); + //Random r3 = new Random(923063597592675214L) ; + + if (randVect==true){ + for (int i = 0; i < so.getDimparameter(); i++) { + rngvec[i] = (float) r.nextGaussian(); + //System.out.println(rngvec[i]); + } + for (int i = 0; i < so.getDimparameter(); i++) + rngvec2[i] = (float) r2.nextGaussian(); + + for (int i = 0; i < so.getDimparameter(); i++) + 
rngvec3[i] = (float) r3.nextGaussian(); + + } else { + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) 0; + } + + Multimap WeightAndClusters = findDensityModes2(); + + Listcentroids2 = new ArrayList<>(); + List weights2 =new ArrayList<>(); + + System.out.println("\tNumberOfMicroClusters_AfterPruning = , "+ WeightAndClusters.size()); +// System.out.println("getRandomVector = "+ randVect); + + for (Long weights : WeightAndClusters.keys()) + { + + weights2.add((float)weights); + + } + + + for (Long weight : WeightAndClusters.keySet()) + + { + + centroids2.addAll(WeightAndClusters.get(weight)); + + } + +// Agglomerative3 aggloOffline = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids2, so.getk()); +// aggloOffline.setWeights(weights2); +// this.centroids = aggloOffline.getCentroids(); + + KMeans2 aggloOffline2 = new KMeans2(); + aggloOffline2.setK(so.getk()); + aggloOffline2.setRawData(centroids2); +// aggloOffline2.setWeights(weights2); + this.centroids = aggloOffline2.getCentroids(); + +// MultiKMPP aggloOffline3 = new MultiKMPP(centroids2,so.getk()); +// this.centroids = aggloOffline3.getCentroids(); + +//// DBScan algo = new DBScan(centroids2, (eps/(20)), 3); +//// System.out.println("epsssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss = "+ eps/(20)); +//// this.centroids = algo.getCentroids(); +//// System.out.println("no. 
of final output centroids = "+ centroids.size()); + + } + + public static void main(String[] args) throws FileNotFoundException, + IOException, InterruptedException { + + System.gc(); + + // int k ; //= 10; + // int d = 200;//16; + // int n = 10000; + // float var = 1.5f; + // int count = 1; + // System.out.printf("ClusterVar\t"); + // for (int i = 0; i < count; i++) + // System.out.printf("Trial%d\t", i); + // System.out.printf("RealWCSS\n"); + + // String Output = "/C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/run_results/3runs/rnaseq_k4/OutputTwrpCents_dbscan" ; + + // float f = var; + // float avgrealwcss = 0; + float avgtime = 0; + // System.out.printf("%f\t", f); + // GenerateData gen = new GenerateData(k, n/k, d, f, true, .5f); + + // gen.writeCSVToFile(new File("/C:/Users/deysn/Desktop/temp/run_results/3runs/rough/1D.txt")); + // List data = "/C:/Users/user/Desktop/temp/OutputTwrpCents1" + + // RPHashObject o = new SimpleArrayReader(gen.data, k); + + boolean raw = Boolean.parseBoolean(("raw")); + List data = null; + // "/C:/Users/deysn/Desktop/temp/har/1D.txt" ; C:/Users/deysn/Documents/temp/covtype/1D.txt + // C:/Users/dey.sn/Downloads/temp/covtype/1D.csv ; "C:/Users/dey.sn/Downloads/temp/run_results/3runs/har_k6/1D.txt" + //String inputfile = "C:/Users/dey.sn/Downloads/temp/crop_mapping/1D.csv" ; + String inputfile = "C:/Users/sayan/OneDrive - University of Cincinnati/Documents/downloaded/sensorless_drive/1D.csv" ; + System.out.println(inputfile); + data = VectorUtil.readFile( inputfile , raw); + for (int k=10; k<=10 ;k++) + { + for (int i = 1; i <= 5; i++) + { + //k = 7; + + RPHashObject o = new SimpleArrayReader(data, k); + + o.setDimparameter(16); + o.setCutoff(250); //230 + o.setRandomVector(true); + +// System.out.println("cutoff = "+ o.getCutoff()); +// System.out.println("get_random_Vector = "+ o.getRandomVector()); + +// TWRPv6_wcss_offline2_TEST rphit = new TWRPv6_wcss_offline2_TEST(o); + PPAHStream rphit = new PPAHStream(o); 
+ + + System.gc(); + + Runtime rt = Runtime.getRuntime(); + rt.gc(); + Thread.sleep(10); + rt.gc(); + long startmemory = rt.totalMemory() - rt.freeMemory(); + long startTime = System.nanoTime(); + + List centsr = rphit.getCentroids(); + + avgtime += (System.nanoTime() - startTime) / 1000000000f ; + + float usedMB = ((rt.totalMemory() - rt.freeMemory()) - startmemory) / (1024*1024); + + System.out.println(" Time(in sec), " + avgtime + ", Mem_Used(MB):, " + (usedMB/3) ); + + rt.gc(); + Thread.sleep(10); + rt.gc(); + +// avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(),gen.getData()); +// String Output = "/C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/run_results/3runs/rnaseq_k4/OutputTwrpCents_dbscan" ; + //String Output = "C:/Users/dey.sn/Downloads/work/output/cropmap_k7/cropmap_k7_kmeans_130_cutoff"+"_" +k+"_"+i+".csv" ; + String Output = "/C:/Users/sayan/OneDrive - University of Cincinnati/Documents/downloaded/results/har_6clus/har_k6_kmeans_130_cutoff"+"_" +k+"_"+i+".csv" ; + VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); + +// VectorUtil.writeVectorFile(new File(Output+"_"+"labels"+".txt"), centsr.getLabels()); + + +// System.out.printf("WCSS for generated data = "+ "%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.out.printf(",WCSS for Winning Kmeans, = , "+ "%.0f ", StatTests.WCSSECentroidsFloat(centsr, data)); + System.out.println(",k, is: , "+k); +// + System.gc(); + } + } + +// System.out.printf("%.0f\n", avgrealwcss / count); + + + } + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 
0)); + } + } + + @Override + public void setK(int getk) { + this.so.setK(getk); + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + return false; + } + + //@Override + public void setCutoff(int getCutoff) { + this.so.setCutoff(getCutoff); + } + + //@Override + public void setRandomVector(boolean getRandomVector) { + this.so.setRandomVector(getRandomVector); + } + + public void seteps(float eps) { + this.eps=eps; + } +} diff --git a/src/main/java/edu/uc/rphash/PPAHStream_v2.java b/src/main/java/edu/uc/rphash/PPAHStream_v2.java new file mode 100644 index 0000000..8082479 --- /dev/null +++ b/src/main/java/edu/uc/rphash/PPAHStream_v2.java @@ -0,0 +1,320 @@ +package edu.uc.rphash; + +/* + This class will run the Parameter-free Projected Adaptive Hash Stream Clustering + */ +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Random; +import java.util.Map.Entry; +import java.util.TreeSet; +import java.util.stream.Stream; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.generators.GenerateStreamData; + + + + +public class PPAHStream_v2 implements StreamClusterer { + + + private float[] rngvec; + private List centroids = null; + private RPHashObject so; + // #create projector matrixs + Projector projector ; + int ct=0; + int pdim = 20; + + public PPAHStream_v2(int k, GenerateStreamData gen, int i) { + so = new SimpleArrayReader(gen,k); + projector = so.getProjectionType(); + projector.setOrigDim(so.getdim()); + projector.setProjectedDim(pdim); + projector.setRandomSeed(so.getRandomSeed()); + projector.init(); + initTablesWith(); + } + + public List getCentroids(RPHashObject so) { + this.so = so; + return getCentroids(); + } + + + /* + * X - set of vectors 
compute the medoid of a vector set + */ + /** Add vector to running Centroid + * @param cnt_1,cnt_2 + * @param x_1 + */ + public static float[] update_cent(int ct, float[] x, float[] cent){ + for(int i=0;i 0) + s += 1; + addcent(s,x); + } + return s; + } + + + /* + * ===========================MinCount Sketch======================= + */ + public static final long PRIME_MODULUS = (1L << 31) - 1; + private int depth; + private int width; + private int[][] tableS; + private float[][][] tableCent; + private long[] hashA; + + + private void initTablesWith() { + this.width = (int) Math.ceil(2 / .025); + this.depth = (int) Math.ceil(-Math.log(1 - .97) / Math.log(2)); + this.tableS = new int[depth][width]; + this.tableCent = new float[depth][width][];//we will fill these in as we need them + this.hashA = new long[depth];//hash offsets + Random r = new Random(); + for (int i = 0; i < depth; ++i) { + hashA[i] = r.nextLong(); + } + } + + private int hash(long item, int i) { + long hash = hashA[i] * item; + hash += hash >>> 32; + hash &= PRIME_MODULUS; + return (int) (hash % width); + + } + + private int count(long lshhash) { + int min = (int) tableS[0][hash(lshhash, 0)]; + for (int i = 1; i < depth; ++i) { + if (tableS[i][hash(lshhash, i)] < min) + min = (int) tableS[i][hash(lshhash, i)]; + } + return min; + } + + private float[] get_cent_sketch(long lshhash) { + int min = (int) tableS[0][hash(lshhash, 0)]; + int mini = 0; + int minhtmp = 0; + for (int i = 1; i < depth; ++i) { + int htmp = hash(lshhash, i); + if (tableS[i][hash(lshhash, i)] < min){ + mini = i; + minhtmp = htmp; + min = (int) tableS[i][htmp]; + } + } + + return tableCent[mini][minhtmp]; + } + + private void addcent(long lshhash, float[] x){ + + int htmp = hash(lshhash, 0); + int argmini = 0; + int argminhtmp = htmp; + + tableS[0][htmp] += 1; + int min = (int) tableS[0][htmp]; + + for (int i = 1; i < depth; ++i) { + htmp = hash(lshhash, i); + tableS[i][htmp] += 1; + + if (tableS[i][htmp] < min){ + min = (int) 
tableS[i][htmp]; + argmini = i; + argminhtmp = htmp; + } + } + + if(tableCent[argmini][argminhtmp]==null){ + tableCent[argmini][argminhtmp] = x; + } + else{ + update_cent(min, x, tableCent[argmini][argminhtmp]); + } + } + /* + * ===========================MinCount Sketch======================= + */ + + + + /* + * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid + * vector map + * + * hash the projected vector x and update the hash to centroid and counts + * maps + */ + void addtocounter(float[] x, Projector p) { + float[] xt = p.project(x); + hashvec(xt, x); + } + + @Override + public long addVectorOnlineStep(float[] x) { + addtocounter(x, projector); + return 0; + } + + @Override + public List getCentroidsOfflineStep() { + + // next we want to prune the tree by parent count comparison + // follows breadthfirst search + HashMap densityAndID = new HashMap(); + for (Long cur_id =0l;cur_id<2<>> 1; + long parent_count = count(parent_id); + + if (2 * cur_count > parent_count) { + densityAndID.put(parent_id, 0l); + densityAndID.put(cur_id,cur_count); + } + } + + //remove keys with support less than 2 + Stream> stream = densityAndID.entrySet().stream().filter(p -> p.getValue() > 1); + //64 so 6 bits? + //stream = stream.filter(p -> p.getKey() > 64); + + List sortedIDList= new ArrayList<>(); + // sort and limit the list + stream.sorted(Entry. 
comparingByValue().reversed()).limit(so.getk()*1) + .forEachOrdered(x -> sortedIDList.add(x.getKey())); + + // compute centroids + List estcents = new ArrayList<>(); + for (int i = 0; i < sortedIDList.size(); i++) { + System.out.println(densityAndID.get(sortedIDList.get(i))); + if(get_cent_sketch(sortedIDList.get(i))!=null) + estcents.add(new Centroid( get_cent_sketch(sortedIDList.get(i)))); + } + + return estcents; + } + + @Override + public void shutdown() { + } + + @Override + public int getProcessors() { + return 0; + } + + @Override + public List getCentroids() { + return null; + } + + + public static void main(String[] args) throws Exception { + + int k = 10; + int d = 100; + int interval = 1000; + float var = 1f; + + Runtime rt = Runtime.getRuntime(); + GenerateStreamData gen = new GenerateStreamData(k, d, var, 1133131); + + StreamClusterer rphit = new PPAHStream_v2(k, gen, 1); + //StreamClusterer rphit = new RPHashStreaming(k, gen, 1); + + ArrayList vecsInThisRound = new ArrayList(); + + System.out.printf("Vecs\tMem(KB)\tTime\tWCSSE\n"); + long timestart = System.nanoTime(); + for (int i = 0; i < interval * 6; i++) { + vecsInThisRound.add(gen.generateNext()); + if (i % interval == interval - 1) { + timestart = System.nanoTime(); + for (float[] f : vecsInThisRound) { + rphit.addVectorOnlineStep(f); + } + + List cents = rphit.getCentroidsOfflineStep(); + long time = System.nanoTime() - timestart; + rt.gc(); + long usedkB = (rt.totalMemory() - rt.freeMemory()) / 1024; + double wcsse = StatTests.WCSSECentroidsFloat(cents, vecsInThisRound); + vecsInThisRound = new ArrayList(); + System.out.printf("%d\t%d\t%.4f\t%.4f\n", i, usedkB, + time / 1000000000f, wcsse); + } + } + } + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void setRawData(List 
centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + this.so.setK(getk); + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + // TODO Auto-generated method stub + return false; + } + +} diff --git a/src/main/java/edu/uc/rphash/PRPHashStream.java b/src/main/java/edu/uc/rphash/PRPHashStream.java new file mode 100644 index 0000000..a906431 --- /dev/null +++ b/src/main/java/edu/uc/rphash/PRPHashStream.java @@ -0,0 +1,279 @@ +package edu.uc.rphash; +/* +This class will run the Parameter-free Random Projection Hash Stream Clustering +*/ +import java.util.ArrayList; +import java.util.List; +import java.util.Random; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +import java.util.concurrent.TimeUnit; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.concurrent.VectorLevelConcurrency; +import edu.uc.rphash.decoders.Decoder; +import edu.uc.rphash.frequentItemSet.KHHCentroidCounter; +//import edu.uc.rphash.frequentItemSet.KHHCountMinSketch.Tuple; +import edu.uc.rphash.lsh.LSH; +//import edu.uc.rphash.projections.DBFriendlyProjection; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.standardhash.HashAlgorithm; +import edu.uc.rphash.standardhash.MurmurHash; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.clusterers.KMeansPlusPlus; +import edu.uc.rphash.tests.generators.ClusterGenerator; +import edu.uc.rphash.tests.generators.GenerateStreamData; + +public class PRPHashStream implements StreamClusterer { + public List is; + public List lshfuncs; + private StatTests vartracker; + private List> centroids = null; + private List bestcentroids = null; + 
private RPHashObject so; + ExecutorService executor; + private final int processors; + private int concurrentRuns; + + boolean initialized=false; + @Override + public int getProcessors() { + return processors; + } + + @Override + public long addVectorOnlineStep(final float[] vec) { + if(!initialized){ + System.out.println("Not initialized!"); + try { + Thread.sleep(100); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + for(int i = 0;i(concurrentRuns); + lshfuncs = new ArrayList(concurrentRuns); + for(int i = 0;i noise = LSH.genNoiseTable(dec.getDimensionality(), + so.getNumBlur(), r, dec.getErrorRadius() + / dec.getDimensionality()); + lshfunc[projidx] = new LSH(dec, p, hal, noise,so.getNormalize()); + } + lshfuncs.add(lshfunc); + } + initialized = true; + } + + public PRPHashStream(int k, ClusterGenerator c) { + so = new SimpleArrayReader(c, k); + if (so.getParallel()) + this.processors = Runtime.getRuntime().availableProcessors(); + else + this.processors = 1; + executor = Executors.newFixedThreadPool(this.processors ); + init(); + } + + public PRPHashStream(List data, int k) { + so = new SimpleArrayReader(data, k); + if (so.getParallel()) + this.processors = Runtime.getRuntime().availableProcessors(); + else + this.processors = 1; + executor = Executors.newFixedThreadPool(this.processors ); + init(); + } + + public PRPHashStream(RPHashObject so) { + this.so = so; + if (so.getParallel()) + this.processors = Runtime.getRuntime().availableProcessors(); + else + this.processors = 1; + executor = Executors.newFixedThreadPool(this.processors ); + init(); + } + + public PRPHashStream(int k, GenerateStreamData c, int processors) { + so = new SimpleArrayReader(c, k); + if (so.getParallel()) + this.processors = processors; + else + this.processors = 1; + executor = Executors.newFixedThreadPool(this.processors ); + init(); + } + + @Override + public List getCentroids() { + if (centroids == null) { + init(); + run(); + getCentroidsOfflineStep(); + } + 
return bestcentroids; + } + + public List getCentroidsOfflineStep() { + if (so.getParallel()) { + executor.shutdown(); + try { + executor.awaitTermination(10, TimeUnit.SECONDS); + } catch (InterruptedException e) { + e.printStackTrace(); + } + executor = Executors.newFixedThreadPool(this.processors); + } + + bestcentroids = new ArrayList(); +// List projIDs = new ArrayList(); +// List cents = is.getTop(); +// List counts = is.getCounts(); +// + List cents = new ArrayList(); + int i = 0; + //get rid of size one clusters that are there just because they were added to the list at the end + for (; i < is.size() ; i++) { +// if(is.get(i).count==1)break; + cents.addAll(is.get(i).getTop()); + } + + ; +// counts = counts.subList(0, i); + Clusterer offlineclusterer = new KMeansPlusPlus(); + offlineclusterer.setData(cents); + offlineclusterer.setK(so.getk()); + cents = offlineclusterer.getCentroids(); + + + +// while(centroids.size()so.getk())cents = offlineclusterer.getCentroids(); +// if(cents.size() vecs = so.getVectorIterator(); +// while (vecs.hasNext()) { +// if (so.getParallel()) { +// float[] vec = vecs.next(); +// executor.execute(new VectorLevelConcurrency(vec, lshfuncs,is,so)); +// } else { +// addVectorOnlineStep(vecs.next()); +// } +// } + } + + public List getTopIdSizes() { + return null; +// return is.getCounts(); + } + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + + } + + @Override + public void setRawData(List data) + { +// this.centroids = new ArrayList(data.size()); +// for(float[] f: data){ +// this.data.add(new Centroid(f,0)); +// } + } + + @Override + public void setData(List centroids) { + ArrayList data = new ArrayList(centroids.size()); + for(Centroid c : centroids)data.add(c.centroid()); + setRawData(data); + } + + + @Override + public void setK(int getk) { + + } + + @Override + public void shutdown() { + if (so.getParallel()) { + executor.shutdown(); + try { +// 
System.out.println("Shutting Down"); + executor.awaitTermination(1200, TimeUnit.SECONDS); + } catch (InterruptedException e) { + e.printStackTrace(); + } + executor = Executors.newFixedThreadPool(this.processors ); + } + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + @Override + public boolean setMultiRun(int runs) { + return false; + } + +} diff --git a/src/main/java/edu/uc/rphash/RPHash.java b/src/main/java/edu/uc/rphash/RPHash.java index e62b867..b0ed7be 100644 --- a/src/main/java/edu/uc/rphash/RPHash.java +++ b/src/main/java/edu/uc/rphash/RPHash.java @@ -22,11 +22,11 @@ import edu.uc.rphash.decoders.DepthProbingLSH; import edu.uc.rphash.decoders.Dn; import edu.uc.rphash.decoders.E8; -import edu.uc.rphash.decoders.Golay; + import edu.uc.rphash.decoders.Leech; import edu.uc.rphash.decoders.MultiDecoder; import edu.uc.rphash.decoders.OriginDecoder; -import edu.uc.rphash.decoders.PsdLSH; + import edu.uc.rphash.decoders.Spherical; import edu.uc.rphash.projections.DBFriendlyProjection; import edu.uc.rphash.projections.FJLTProjection; @@ -34,13 +34,13 @@ import edu.uc.rphash.projections.NoProjection; import edu.uc.rphash.projections.SVDProjection; import edu.uc.rphash.tests.StatTests; -import edu.uc.rphash.tests.clusterers.AdaptiveMeanShift; + import edu.uc.rphash.tests.clusterers.Agglomerative3; import edu.uc.rphash.tests.clusterers.DummyClusterer; import edu.uc.rphash.tests.clusterers.DBScan; import edu.uc.rphash.tests.clusterers.KMeans2; import edu.uc.rphash.tests.clusterers.KMeansPlusPlus; -import edu.uc.rphash.tests.clusterers.LloydIterativeKmeans; + import edu.uc.rphash.tests.clusterers.MultiKMPP; import edu.uc.rphash.tests.clusterers.StreamingKmeans; import edu.uc.rphash.tests.clusterers.StreamingKmeans2; @@ -51,17 +51,26 @@ public class RPHash { static String[] clusteringmethods = { "simple", "streaming", "multiproj", - "kmeans", "pkmeans","kmeansplusplus", "streamingkmeans", "adaptive","dummy" 
}; + "kmeans", "pkmeans","kmeansplusplus", "streamingkmeans", "adaptive","dummy" ,"twrp" , "twrpbisect", "twrpbest", "twrpmergetree", "twrpbest_afterpruning", + "twrpbest_cov","twrpbest_meanvariance" }; + static String[] offlineclusteringmethods = { "singlelink", "completelink", "averagelink", "kmeans", "adaptivemeanshift", "kmpp", "multikmpp" , "dbscan", "none" }; + static String[] projectionmethods = { "dbf", "fjlt", "rp", "svd", "noproj" }; + static String[] ops = { "numprojections", "innerdecodermultiplier", "numblur", "randomseed", "hashmod", "parallel", "streamduration", "raw", "decayrate", "dimparameter", "decodertype", "offlineclusterer", "runs", "normalize", "projection" }; + static String[] decoders = { "dn", "e8", "golay", "multie8", "leech", "multileech", "sphere", "levypstable", "cauchypstable", "gaussianpstable", "adaptive", "origin" }; + + static String[] twrp_options = { "cutoff", "randomvector" }; + + public static void main(String[] args) throws NumberFormatException, IOException, InterruptedException { @@ -95,6 +104,12 @@ public static void main(String[] args) throws NumberFormatException, System.out.print(s + " ,"); System.out.print("]\n"); + System.out.print("\t twrp_options" + "\t:["); + for (String s : twrp_options) + System.out.print(s + " ,"); + System.out.print("]\n"); + + System.exit(0); } @@ -114,6 +129,9 @@ public static void main(String[] args) throws NumberFormatException, matched |= keyword.equals(match); for (String match : decoders) matched |= keyword.equals(match); + for (String match : twrp_options) + matched |= keyword.equals(match); + if (!matched) unmatchedkeywords.add(keyword); } @@ -503,6 +521,20 @@ public static List runConfigs(List untaggedArgs, o.setNormalize(Boolean.parseBoolean(taggedArgs.get("normalize"))); so.setNormalize(Boolean.parseBoolean(taggedArgs.get("normalize"))); } + + + if (taggedArgs.containsKey("cutoff")) { + o.setCutoff(Integer.parseInt(taggedArgs.get("cutoff"))); + 
so.setCutoff(Integer.parseInt(taggedArgs.get("cutoff"))); + } + + + if (taggedArgs.containsKey("randomvector")) { + o.setRandomVector(Boolean.parseBoolean(taggedArgs.get("randomvector"))); + so.setRandomVector(Boolean.parseBoolean(taggedArgs.get("randomvector"))); + } + + if (taggedArgs.containsKey("projection")) { switch (taggedArgs.get("projection")) { @@ -552,11 +584,7 @@ public static List runConfigs(List untaggedArgs, so.setDecoderType(new E8(2f)); break; } - case "golay": { - o.setDecoderType(new Golay()); - so.setDecoderType(new Golay()); - break; - } + case "multie8": { o.setDecoderType(new MultiDecoder( o.getInnerDecoderMultiplier() * 8, new E8(2f))); @@ -576,23 +604,9 @@ public static List runConfigs(List untaggedArgs, .getInnerDecoderMultiplier() * 24, new Leech())); break; } - case "levypstable": { - o.setDecoderType(new PsdLSH(PsdLSH.LEVY, o.getDimparameter())); - so.setDecoderType(new PsdLSH(PsdLSH.LEVY, o.getDimparameter())); - break; - } - case "cauchypstable": { - o.setDecoderType(new PsdLSH(PsdLSH.CAUCHY, o.getDimparameter())); - so.setDecoderType(new PsdLSH(PsdLSH.CAUCHY, o.getDimparameter())); - break; - } - case "gaussianpstable": { - o.setDecoderType(new PsdLSH(PsdLSH.GAUSSIAN, o - .getDimparameter())); - so.setDecoderType(new PsdLSH(PsdLSH.GAUSSIAN, o - .getDimparameter())); - break; - } + + + case "sphere": {// pad to ~32 bits // int ctsofsphere = // (int)(Math.log(o.getDimparameter()*2)/Math.log(2.0)) /2; @@ -659,13 +673,7 @@ public static List runConfigs(List untaggedArgs, o.setOfflineClusterer(new KMeansPlusPlus()); so.setOfflineClusterer(new KMeansPlusPlus()); break; - case "adaptivemeanshift": { - - o.setOfflineClusterer(new AdaptiveMeanShift()); - so.setOfflineClusterer(new AdaptiveMeanShift()); - - break; - } + case "kmpp": { o.setOfflineClusterer(new KMeansPlusPlus()); @@ -727,11 +735,7 @@ public static List runConfigs(List untaggedArgs, runitems.add(new KMeans2(k, o.getRawData())); break; } - case "pkmeans": - runitems.add(new 
LloydIterativeKmeans(k, o.getRawData(), o - .getNumProjections())); - break; - + case "kmeansplusplus": runitems.add(new KMeansPlusPlus(o.getRawData(), k)); break; @@ -743,14 +747,50 @@ public static List runConfigs(List untaggedArgs, runitems.add(new StreamingKmeans2(o)); break; } - case "adaptivemeanshift": { - runitems.add(new AdaptiveMeanShift()); - break; - } + case "adaptive": { runitems.add(new RPHashAdaptive2Pass(o)); break; } + + case "twrp": { + runitems.add(new TWRPv2(o)); + break; + } + + case "twrpmergetree": { + runitems.add(new TWRPv3(o)); + break; + } + + case "twrpbisect": { + runitems.add(new TWRPv4(o)); + break; + } + + case "twrpbest": { + runitems.add(new TWRPv5_WCSS(o)); + break; + } + + case "twrpbest_afterpruning": { + runitems.add(new TWRPv6_WCSS2(o)); + break; + } + + case "twrpbest_cov": { + runitems.add(new TWRPv6_COV(o)); + break; + } + + case "twrpbest_meanvariance": { + runitems.add(new TWRPv6_meanVariance(o)); + break; + } + + + + case "dummy": { runitems.add(new DummyClusterer(so)); break; diff --git a/src/main/java/edu/uc/rphash/RPHashAdaptive2Pass.java b/src/main/java/edu/uc/rphash/RPHashAdaptive2Pass.java index 013ea27..927bf7f 100644 --- a/src/main/java/edu/uc/rphash/RPHashAdaptive2Pass.java +++ b/src/main/java/edu/uc/rphash/RPHashAdaptive2Pass.java @@ -1,8 +1,10 @@ package edu.uc.rphash; +import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map.Entry; @@ -16,11 +18,12 @@ import edu.uc.rphash.tests.StatTests; import edu.uc.rphash.tests.clusterers.Agglomerative3; import edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.VectorUtil; public class RPHashAdaptive2Pass implements Clusterer, Runnable { - boolean znorm = true; + boolean znorm = false; private int counter; @@ -63,17 +66,21 @@ float[] medoid(List X) { //float[] rngvec; the range vector is moot if 
incoming data has been normalized //post normalization it should all be zero centered, with variance 1 - /* * super simple hash algorithm, reminiscient of pstable lsh */ + // xt is the projected vector and x is the original vector , rngvec is the randomly generated vector of projected dim. + public long hashvec(float[] xt, float[] x, HashMap> IDAndCent, HashMap> IDAndLabel,int ct) { - long s = 1;//fixes leading 0's bug + long s = 1; //fixes leading 0's bug for (int i = 0; i < xt.length; i++) { - s <<= 1; +// s <<= 1; + s = s << 1 ; // left shift the bits of s by 1. if (xt[i] > rngvec[i]) - s += 1; +// s += 1; + s= s+1; + if (IDAndCent.containsKey(s)) { IDAndLabel.get(s).add(ct); IDAndCent.get(s).add(x); @@ -143,16 +150,17 @@ public List> findDensityModes() { projector.init(); int ct = 0; - if(znorm == true){ - float[] variance = StatTests.varianceCol(so.getRawData()); - float[] mean = StatTests.meanCols(so.getRawData()); - // #process data by adding to the counter - for (float[] x : so.getRawData()) - { - addtocounter(x, projector, IDAndCent,IDAndID,ct++,mean,variance); - } - } - else +// if(znorm == true){ +// float[] variance = StatTests.varianceCol(so.getRawData()); +// float[] mean = StatTests.meanCols(so.getRawData()); +// // #process data by adding to the counter +// for (float[] x : so.getRawData()) +// { +// addtocounter(x, projector, IDAndCent,IDAndID,ct++,mean,variance); +// } +// } +// +// else { for (float[] x : so.getRawData()) @@ -161,6 +169,24 @@ public List> findDensityModes() { } } + +// for (Long name: IDAndCent.keySet()){ +// +// String key =name.toString(); +// // String value = IDAndCent.get(name).toString() ; +// // String value1 = Arrays.toString(value.toString()); +// System.out.println(key ) ;//+ " " + value); +//} + + System.out.println("NumberOfMicroClustersBeforePruning = "+ IDAndCent.size()); +// for (Long name: IDAndID.keySet()){ +// String key =name.toString(); +// String value = IDAndID.get(name).toString(); +// 
System.out.println(key + " " + value); +// +// +//} + // next we want to prune the tree by parent count comparison // follows breadthfirst search HashMap denseSetOfIDandCount = new HashMap(); @@ -197,9 +223,11 @@ public List> findDensityModes() { List sortedIDList= new ArrayList<>(); // sort and limit the list - stream.sorted(Entry. comparingByValue().reversed()).limit(so.getk()*4) + stream.sorted(Entry. comparingByValue().reversed()).limit(so.getk()*6) .forEachOrdered(x -> sortedIDList.add(x.getKey())); + System.out.println("NumberOfMicroClustersAfterPruning = "+ sortedIDList.size()); + // compute centroids HashMap> estcents = new HashMap<>(); @@ -207,12 +235,16 @@ public List> findDensityModes() { { estcents.put(sortedIDList.get(i), IDAndCent.get(sortedIDList.get(i))); } + + // System.out.println(); // for (int i =0; i centroids = null; - private RPHashObject so; - int threads = 4; - - public RPHashAdaptive2PassParallel(RPHashObject so) { - this.threads = 4; - this.so = so; - } - - public RPHashAdaptive2PassParallel(List data, int k, int processors) { - this.threads = processors; - so = new SimpleArrayReader(data, k); - } - - public List getCentroids(RPHashObject so) { - this.so = so; - return getCentroids(); - } - - @Override - public List getCentroids() { - if (centroids == null) - run(); - return centroids; - } - - /* - * X - set of vectors compute the medoid of a vector set - */ - float[] medoid(List X) { - float[] ret = X.get(0); - for (int i = 1; i < X.size(); i++) { - for (int j = 0; j < ret.length; j++) { - ret[j] += X.get(i)[j]; - } - } - for (int j = 0; j < ret.length; j++) { - ret[j] = ret[j] / ((float) X.size()); - } - return ret; - } - - // float[] rngvec; the range vector is moot if incoming data has been - // normalized - // post normalization it should all be zero centered, with variance 1 - - /* - * super simple hash algorithm, reminiscient of pstable lsh - */ - public long hashvec(float[] xt, float[] x, - Map> IDAndCent, - Map> IDAndLabel, int 
ct) { - long s = 1;// fixes leading 0's bug - for (int i = 0; i < xt.length; i++) { - s <<= 1; - if (xt[i] > rngvec[i]) - s += 1; - if (IDAndCent.containsKey(s)) { - if (IDAndLabel.get(s) != null) - IDAndLabel.get(s).add(ct); - if (IDAndCent.get(s) != null) - IDAndCent.get(s).add(x); - } else { - ArrayList xlist = new ArrayList<>(); - xlist.add(x); - IDAndCent.put(s, xlist); - ArrayList idlist = new ArrayList<>(); - idlist.add(ct); - IDAndLabel.put(s, idlist); - } - } - return s; - } - - /* - * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid - * vector map - * - * hash the projected vector x and update the hash to centroid and counts - * maps - */ - void addtocounter(float[] x, Projector p, - Map> IDAndCent, - Map> IDandID, int ct) { - float[] xt = p.project(x); - - hashvec(xt, x, IDAndCent, IDandID, ct); - } - - /* - * X - data set k - canonical k in k-means l - clustering sub-space Compute - * density mode via iterative deepening hash counting - */ - public Collection> findDensityModes() - throws InterruptedException, ExecutionException { - - // #create projector matrixs - Projector projector = so.getProjectionType(); - projector.setOrigDim(so.getdim()); - projector.setProjectedDim(so.getDimparameter()); - projector.setRandomSeed(so.getRandomSeed()); - projector.init(); - - // int ct = 1; - - List dat = so.getRawData(); - - //this counter gets shared - AtomicInteger ct = new AtomicInteger(0); - - ForkJoinPool executor = new ForkJoinPool(this.threads); - - int chunksize = dat.size() / this.threads; - - //This is the array of essentially thread objets that process in parallel - ArrayList>>> gather = new ArrayList<>(this.threads); - - for (int i = 0; i < this.threads; i++) { - - int chunk = chunksize* i; - gather.add(executor.submit(new Callable>>() { - - // this is the mapper function. the dataset is split among the processing threads - // each thread performs the projections and counter adds. 
- // this method is sequentially bottlenecked in regard to the add part - // there are some ways to fix this, but ultimately each thread needs to maintain - // its own count-sketch. then those sketch must be merged, via the binary - // operation - public Map> call() { - Map> IDAndCent = new HashMap<>(); - Map> IDAndID = new HashMap<>(); - for (int j = chunk; j < chunksize + chunk && j < dat.size(); j++) { - addtocounter(dat.get(j), projector, IDAndCent, IDAndID, - ct.incrementAndGet()); - } - return IDAndCent ;//new Object[] { IDAndCent, IDAndID }; - } - })); - } - - List>> gatheredCent = new ArrayList<>(this.threads); -// List> gatheredID = new ArrayList<>(this.threads); - -// executor.awaitTermination(10,TimeUnit.SECONDS); - for (Future>> f : gather) { - Map> o = f.get(); - gatheredCent.add(o); -// gatheredID.add((Map) o[1]); - } - - executor.shutdown(); - - - // this function merges the centroid sets in parallel. - // it would be the basis of the reduce part - // even though the functions are called map, the return is a collection/gather operation - Map IDAndCent = gatheredCent - .stream() - .parallel() - .map(Map::entrySet) - .flatMap(Collection::stream) - .collect( - Collectors.toConcurrentMap(Map.Entry::getKey, - Map.Entry::getValue, - (old, latest)->{ - old.addAll(latest); - return old; - } - )); - - - //this is sequential... 
- // next we want to prune the tree by parent count comparison - // follows breadthfirst search - HashMap denseSetOfIDandCount = new HashMap(); - for (Long cur_id : new TreeSet(IDAndCent.keySet())) { - if (cur_id > so.getk()) { - int cur_count = IDAndCent.get(cur_id).size(); - long parent_id = cur_id >>> 1; - int parent_count = IDAndCent.get(parent_id).size(); - - if (cur_count != 0 && parent_count != 0) { - if (cur_count == parent_count) { - denseSetOfIDandCount.put(parent_id, 0L); - IDAndCent.put(parent_id, new ArrayList<>()); - denseSetOfIDandCount.put(cur_id, (long) cur_count); - } else { - if (2 * cur_count > parent_count) { - denseSetOfIDandCount.remove(parent_id); - IDAndCent.put(parent_id, new ArrayList<>()); - denseSetOfIDandCount.put(cur_id, (long) cur_count); - } - } - } - } - } - - // remove keys with support less than 1 - Stream> stream = denseSetOfIDandCount.entrySet() - .parallelStream().filter(p -> p.getValue() > 1); - // 64 so 6 bits? - // stream = stream.filter(p -> p.getKey() > 64); - - List sortedIDList = new ArrayList<>(); - // sort and limit the list - stream.sorted(Entry. comparingByValue().reversed()) - .limit(so.getk() * 4).parallel() - .forEachOrdered(x -> sortedIDList.add(x.getKey())); - - // compute centroids - - HashMap> estcents = new HashMap<>(); - for (int i = 0; i < sortedIDList.size(); i++) - { - estcents.put(sortedIDList.get(i), - new ArrayList(IDAndCent.get(sortedIDList.get(i)))); - } - - return estcents.values(); - } - - public void run() { - rngvec = new float[so.getDimparameter()]; - Random r = new Random(so.getRandomSeed()); - for (int i = 0; i < so.getDimparameter(); i++) - rngvec[i] = (float) r.nextGaussian(); - - Collection> clustermembers; - try { - clustermembers = findDensityModes(); - - List centroids = new ArrayList<>(); - - List weights = new ArrayList<>(); - int k = clustermembers.size() > 200 + so.getk() ? 
200 + so.getk() - : clustermembers.size(); - - for (List cl : clustermembers) { - weights.add(new Float(cl.size())); - centroids.add(medoid(cl)); - } - - Agglomerative3 aggloOffline = new Agglomerative3(centroids, - so.getk()); - aggloOffline.setWeights(weights); - this.centroids = aggloOffline.getCentroids(); - } catch (InterruptedException | ExecutionException e) { - - e.printStackTrace(); - } - } - - public static void main(String[] args) throws FileNotFoundException, - IOException { - - int k = 10; - int d = 1000; - int n = 10000; - float var = 1.1f; - int count = 10; - System.out.printf("ClusterVar\t"); - for (int i = 0; i < count; i++) - System.out.printf("Trial%d\t", i); - System.out.printf("RealWCSS\n"); - - for (float f = var; f < 5.01; f += .05f) { - float avgrealwcss = 0; - float avgtime = 0; - System.out.printf("%f\t", f); - for (int i = 0; i < count; i++) { - GenerateData gen = new GenerateData(k, n / k, d, f, true, 1f); - // gen.writeCSVToFile(new - // File("/home/lee/Desktop/reclsh/in.csv")); - RPHashObject o = new SimpleArrayReader(gen.data, k); - o.setDimparameter(32); - - RPHashAdaptive2PassParallel rphit = new RPHashAdaptive2PassParallel( - o); - long startTime = System.nanoTime(); - List centsr = rphit.getCentroids(); - - avgtime += (System.nanoTime() - startTime) / 100000000; - - avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(), - gen.getData()); - - System.out.printf("%.0f\t", - StatTests.WCSSECentroidsFloat(centsr, gen.data)); - System.gc(); - } - System.out.printf("%.0f\n", avgrealwcss / count); - } - } - - @Override - public RPHashObject getParam() { - return so; - } - - @Override - public void setWeights(List counts) { - // TODO Auto-generated method stub - - } - - @Override - public void setData(List centroids) { - this.centroids = centroids; - - } - - @Override - public void setRawData(List centroids) { - if (this.centroids == null) - this.centroids = new ArrayList<>(centroids.size()); - for (float[] f : centroids) { - 
this.centroids.add(new Centroid(f, 0)); - } - } - - @Override - public void setK(int getk) { - this.so.setK(getk); - } - - @Override - public void reset(int randomseed) { - centroids = null; - so.setRandomSeed(randomseed); - } - - @Override - public boolean setMultiRun(int runs) { - return false; - } -} diff --git a/src/main/java/edu/uc/rphash/RPHashMultiProj.java b/src/main/java/edu/uc/rphash/RPHashMultiProj.java deleted file mode 100644 index 18f6f47..0000000 --- a/src/main/java/edu/uc/rphash/RPHashMultiProj.java +++ /dev/null @@ -1,307 +0,0 @@ -package edu.uc.rphash; - -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.Random; - -import edu.uc.rphash.Readers.RPHashObject; -import edu.uc.rphash.Readers.SimpleArrayReader; -import edu.uc.rphash.decoders.Decoder; -import edu.uc.rphash.decoders.Leech; -import edu.uc.rphash.decoders.Spherical; -import edu.uc.rphash.frequentItemSet.ItemSet; -import edu.uc.rphash.frequentItemSet.SimpleFrequentItemSet; -import edu.uc.rphash.lsh.LSH; -import edu.uc.rphash.projections.Projector; -import edu.uc.rphash.standardhash.HashAlgorithm; -import edu.uc.rphash.standardhash.NoHash; -import edu.uc.rphash.tests.StatTests; -import edu.uc.rphash.tests.generators.GenerateData; - -/** - * This is the correlated multi projections approach. In this RPHash variation - * we try to incorporate the advantage of multiple random projections in order - * to combat increasing cluster error rates as the deviation between projected - * and full data increases. The main idea is similar to the referential RPHash, - * however the set union is projection id dependent. This will be done in a - * simplified bitmask addition to the hash code in lieu of an array of sets data - * structures. 
- * - * @author lee - * - */ -public class RPHashMultiProj implements Clusterer { - float variance; - - public RPHashObject map() { - Iterator vecs = so.getVectorIterator(); - if (!vecs.hasNext()) - return so; - - long[] hash; - int projections = so.getNumProjections(); - - int k = (int) (so.getk() * 2); - - // initialize our counter - ItemSet is = new SimpleFrequentItemSet(k); - // create our LSH Device - // create same LSH Device as before - - Random r = new Random(so.getRandomSeed()); - LSH[] lshfuncs = new LSH[projections]; - Decoder dec = so.getDecoderType(); - dec.setCounter(is); - HashAlgorithm hal = new NoHash(so.getHashmod()); - - // create same projection matrices as before - for (int i = 0; i < projections; i++) { - Projector p = so.getProjectionType(); - p.setOrigDim(so.getdim()); - p.setProjectedDim(dec.getDimensionality()); - p.setRandomSeed(r.nextLong()); - p.init(); - - List noise = LSH.genNoiseTable(dec.getDimensionality(), - so.getNumBlur(), r, - dec.getErrorRadius() / dec.getDimensionality()); - - lshfuncs[i] = new LSH(dec, p, hal, noise, so.getNormalize()); - } - - // add to frequent itemset the hashed Decoded randomly projected vector - while (vecs.hasNext()) { - float[] vec = vecs.next(); - // iterate over the multiple projections - for (LSH lshfunc : lshfuncs) { - // could do a big parallel projection here - hash = lshfunc.lshHashRadius(vec, so.getNumBlur()); - for (long hh : hash) { - is.add(hh); - } - } - } - so.setPreviousTopID(is.getTop()); - List countsAsFloats = new ArrayList(); - for (long ct : is.getCounts()) - countsAsFloats.add((float) ct); - so.setCounts(countsAsFloats); - return so; - } - - /* - * This is the second phase after the top ids have been in the reduce phase - * aggregated - */ - public RPHashObject reduce() { - Iterator vecs = so.getVectorIterator(); - if (!vecs.hasNext()) - return so; - - // make a set of k default centroid objects - ArrayList centroids = new ArrayList(); - for (long id : so.getPreviousTopID()) - 
centroids.add(new Centroid(so.getdim(), id, -1)); - - long[] hash; - int projections = so.getNumProjections(); - - // create our LSH Device - // create same LSH Device as before - Random r = new Random(so.getRandomSeed()); - LSH[] lshfuncs = new LSH[projections]; - Decoder dec = so.getDecoderType(); - HashAlgorithm hal = new NoHash(so.getHashmod()); - - // create same projection matrices as before - for (int i = 0; i < projections; i++) { - Projector p = so.getProjectionType(); - p.setOrigDim(so.getdim()); - p.setProjectedDim(dec.getDimensionality()); - p.setRandomSeed(r.nextLong()); - p.init(); - List noise = LSH.genNoiseTable(dec.getDimensionality(), - so.getNumBlur(), r, - dec.getErrorRadius() / dec.getDimensionality()); - lshfuncs[i] = new LSH(dec, p, hal, noise, so.getNormalize()); - } - - while (vecs.hasNext()) { - float[] vec = vecs.next(); - // iterate over the multiple projections - for (LSH lshfunc : lshfuncs) { - // could do a big parallel projection here - hash = lshfunc.lshHashRadius(vec, so.getNumBlur()); - for (Centroid cent : centroids) { - for (long hh : hash) { - if (cent.ids.contains(hh)) { - cent.updateVec(vec); - cent.addID(hh); - } - } - } - } - } - so.setCentroids(centroids); - return so; - } - - private List centroids = null; - private RPHashObject so; - private int runs; - - public RPHashMultiProj(int k, List data) { - so = new SimpleArrayReader(data, k); - runs = 1; - } - - public RPHashMultiProj(RPHashObject so) { - this.so = so; - } - - public RPHashMultiProj() { - so = new SimpleArrayReader(); - } - - public List getCentroids(RPHashObject so) { - this.so = so; - - if (centroids == null) - run(); - return centroids; - } - - @Override - public List getCentroids() { - if (centroids == null) { - run(); - } - return centroids; - } - - private void run() { - runs = 1; - double minwcss = Double.MAX_VALUE; - List mincentroids = new ArrayList<>(); - for (int currun = 0; currun < runs;) { - - map(); - reduce(); - - Clusterer offlineclusterer = 
so.getOfflineClusterer(); - List tmpcents; - if (offlineclusterer != null) { - offlineclusterer.setMultiRun(1);// is deterministic - offlineclusterer.setData(so.getCentroids()); - offlineclusterer.setWeights(so.getCounts()); - offlineclusterer.setK(so.getk()); - tmpcents = offlineclusterer.getCentroids(); - } else { - tmpcents = so.getCentroids().subList(0, so.getk()); - } - - if (tmpcents.size() == so.getk()) {// skip bad clusterings - double tmpwcss = StatTests.WCSSECentroidsFloat(tmpcents, - so.getRawData()); - // System.out.println(tmpwcss + ":" + so.getCounts()); - if (tmpwcss < minwcss) { - minwcss = tmpwcss; - mincentroids = tmpcents; - } - currun++; - } - - this.reset(new Random().nextInt()); - - } - - this.centroids = mincentroids; - } - - public static void main(String[] args) { - - int k = 10; - int d = 1000; - int n = 10000; - float var = .6f; - int count = 5; - System.out.printf("Decoder: %s\n","Spherical"); - System.out.printf("ClusterVar\t"); - for (int i = 0; i < count; i++) - System.out.printf("Trial%d\t", i); - System.out.printf("RealWCSS\n"); - - - - for (float f = var; f < 3.01; f += .1f) { - float avgrealwcss = 0; - float avgtime = 0; - System.out.printf("%f\t", f); - for (int i = 0; i < count; i++) { - GenerateData gen = new GenerateData(k, n / k, d, f, true, 1f); - RPHashObject o = new SimpleArrayReader(gen.data, k); - o.setDecoderType(new Spherical(32,4,1)); - o.setDimparameter(32); - RPHashMultiProj rphit = new RPHashMultiProj(o); - long startTime = System.nanoTime(); - List centsr = rphit.getCentroids(); - - avgtime += (System.nanoTime() - startTime) / 100000000; - - avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(), - gen.getData()); - - System.out.printf("%.0f\t", - StatTests.WCSSECentroidsFloat(centsr, gen.data)); - System.gc(); - - } - System.out.printf("%.0f\n", avgrealwcss / count); - } - } - - @Override - public RPHashObject getParam() { - return so; - } - - @Override - public void setWeights(List counts) { - } - - 
@Override - public void setData(List data) { - centroids = new ArrayList<>(); - for (Centroid c : data) { - so.addRawData(c.centroid); - } - so.setDimparameter(data.get(0).dimensions); - } - - @Override - public void setK(int getk) { - this.so.setK(getk); - } - - @Override - public void setRawData(List data) { - so.setRawData(data); - this.so.setDimparameter(data.get(0).length); - } - - @Override - public void reset(int randomseed) { - centroids = null; - so.setRandomSeed(randomseed); - } - - @Override - public boolean setMultiRun(int runs) { - this.runs = runs; - return true; - } - -} diff --git a/src/main/java/edu/uc/rphash/RPHashSimple.java b/src/main/java/edu/uc/rphash/RPHashSimple.java index 2e8cba9..b809e7f 100644 --- a/src/main/java/edu/uc/rphash/RPHashSimple.java +++ b/src/main/java/edu/uc/rphash/RPHashSimple.java @@ -15,6 +15,7 @@ import edu.uc.rphash.decoders.DepthProbingLSH; import edu.uc.rphash.decoders.Leech; import edu.uc.rphash.decoders.Spherical; +import edu.uc.rphash.decoders.SphericalRandom; import edu.uc.rphash.frequentItemSet.ItemSet; import edu.uc.rphash.frequentItemSet.SimpleFrequentItemSet; import edu.uc.rphash.lsh.LSH; @@ -24,6 +25,7 @@ import edu.uc.rphash.standardhash.NoHash; import edu.uc.rphash.tests.StatTests; import edu.uc.rphash.tests.clusterers.KMeans2; +import edu.uc.rphash.tests.clusterers.MultiKMPP; import edu.uc.rphash.tests.generators.GenerateData; import edu.uc.rphash.tests.generators.GenerateStreamData; import edu.uc.rphash.tests.kmeanspp.KMeansPlusPlus; @@ -54,8 +56,8 @@ public RPHashObject map() { int logk = (int) (.5 + Math.log(so.getk()) / Math.log(2));// log k and // round to // integer - int k = so.getk() * logk; - is = new SimpleFrequentItemSet(k); + int k1 = so.getk() * logk; + is = new SimpleFrequentItemSet(k1); Decoder dec = so.getDecoderType(); dec.setCounter(is); @@ -205,13 +207,27 @@ public void accept(float[] t) { // // } + Clusterer offlineclusterer = so.getOfflineClusterer(); 
offlineclusterer.setData(centroids); offlineclusterer.setWeights(so.getCounts()); offlineclusterer.setK(so.getk()); + + // System.out.println("\n k sent to offline = "+ so.getk()); + this.centroids = offlineclusterer.getCentroids(); + + //System.out.println("\n cents in reduce from offline cluster = "+ this.centroids.size()); + + //System.out.println("\n cents in reduce after label mapping = "+ centroids.size()); + this.labelmap = VectorUtil.generateIDMap(centroids, this.centroids); - so.setCentroids(centroids); + + //so.setCentroids(centroids); + so.setCentroids(this.centroids); + + + return so; } @@ -272,12 +288,14 @@ private void run() { map(); reduce(); this.centroids = so.getCentroids(); + + } public static void main(String[] args) { int k = 10; - int d = 1000; - int n = 10000; + int d = 200; + int n = 1000; float var = 1f; int count = 5; System.out.printf("Decoder: %s\n", "Sphere"); @@ -294,19 +312,27 @@ public static void main(String[] args) { GenerateData gen = new GenerateData(k, n / k, d, f, true, 1f); RPHashObject o = new SimpleArrayReader(gen.data, k); RPHashSimple rphit = new RPHashSimple(o); - o.setDecoderType(new Spherical(32, 4, 1)); + o.setDecoderType(new SphericalRandom(32, 4, 1)); + //o.setDecoderType(new Spherical(32, 4, 1)); // o.setDimparameter(31); - o.setOfflineClusterer(new KMeans2()); + //o.setOfflineClusterer(new KMeans2()); + o.setOfflineClusterer(new MultiKMPP()); + + //System.out.println("\n k sent to offline in MAIN = "+ o.getk()); + long startTime = System.nanoTime(); List centsr = rphit.getCentroids(); + + //System.out.println("\n no of final cents : " + centsr.size()); + + avgtime += (System.nanoTime() - startTime) / 100000000; - // avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(), - // gen.getData()); + avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(),gen.getData()); - // System.out.printf("%.0f\t", - // StatTests.WCSSECentroidsFloat(centsr, gen.data)); - // System.gc(); + System.out.printf("%.0f\t", + 
StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.gc(); } System.out.printf("%.0f\n", avgrealwcss / count); diff --git a/src/main/java/edu/uc/rphash/RPHashSimple_multiPosLsh.java b/src/main/java/edu/uc/rphash/RPHashSimple_multiPosLsh.java new file mode 100644 index 0000000..7a37edf --- /dev/null +++ b/src/main/java/edu/uc/rphash/RPHashSimple_multiPosLsh.java @@ -0,0 +1,384 @@ +package edu.uc.rphash; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Random; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ForkJoinPool; +import java.util.function.Consumer; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.decoders.Decoder; +import edu.uc.rphash.decoders.DepthProbingLSH; +import edu.uc.rphash.decoders.Leech; +import edu.uc.rphash.decoders.Spherical; +import edu.uc.rphash.decoders.SphericalRandom; +import edu.uc.rphash.frequentItemSet.ItemSet; +import edu.uc.rphash.frequentItemSet.SimpleFrequentItemSet; +import edu.uc.rphash.lsh.LSH; +//import edu.uc.rphash.projections.DBFriendlyProjection; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.standardhash.HashAlgorithm; +import edu.uc.rphash.standardhash.NoHash; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.clusterers.KMeans2; +import edu.uc.rphash.tests.clusterers.MultiKMPP; +import edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.tests.generators.GenerateStreamData; +import edu.uc.rphash.tests.kmeanspp.KMeansPlusPlus; +import edu.uc.rphash.util.VectorUtil; + +public class RPHashSimple_multiPosLsh implements Clusterer { + // float variance; + + public ItemSet is; + + List labels; + HashMap labelmap; + + public static void mapfunc(float[] vec, LSH lshfunc, ItemSet is) { + + long hash = lshfunc.lshHash(vec); + is.add(hash); + } + + public RPHashObject map() { + + // create our LSH Machine 
+ HashAlgorithm hal = new NoHash(so.getHashmod()); + Iterator vecs = so.getVectorIterator(); + if (!vecs.hasNext()) + return so; + + int logk = (int) (.5 + Math.log(so.getk()) / Math.log(2));// log k and + // round to + // integer + int k1 = so.getk() * logk; + is = new SimpleFrequentItemSet(k1); + Decoder dec = so.getDecoderType(); + dec.setCounter(is); + + Projector p = so.getProjectionType(); + p.setOrigDim(so.getdim()); + p.setProjectedDim(dec.getDimensionality()); + p.setRandomSeed(so.getRandomSeed()); + p.init(); + // no noise to start with + List noise = LSH.genNoiseTable( + dec.getDimensionality(), + so.getNumBlur(), + new Random(), + dec.getErrorRadius() + / (dec.getDimensionality() * dec.getDimensionality())); + + LSH lshfunc = new LSH(dec, p, hal, noise, so.getNormalize()); + + // add to frequent itemset the hashed Decoded randomly projected vector + + if (so.getParallel()) { + List data = so.getRawData(); + data.parallelStream().forEach(new Consumer() { + + @Override + public void accept(float[] t) { + mapfunc(t, lshfunc, is); + } + }); + } + while (vecs.hasNext()) { + mapfunc(vecs.next(), lshfunc, is); + } + // } + // while (vecs.hasNext()) { + // float[] vec = vecs.next(); + // long hash = lshfunc.lshHash(vec); + // is.add(hash); + // + // } + + List topids = is.getTop(); + so.setPreviousTopID(topids); + + List topsizes = is.getCounts(); + + List countsAsFloats = new ArrayList(); + for (long ct : topsizes) + countsAsFloats.add((float) ct); + so.setCounts(countsAsFloats); + return so; + } + + public static void redFunc(float[] vec, LSH lshfunc, List noise, + List labels, List centroids) { + long[] hash = lshfunc.lshHashRadius(vec, noise); + labels.add(-1l); + // radius probe around the vector + for (Centroid cent : centroids) { + for (long h : hash) { + if (cent.ids.contains(h)) { + cent.updateVec(vec); + labels.set(labels.size() - 1, cent.id); + } + } + } + } + + /* + * This is the second phase after the top ids have been in the reduce phase + * 
aggregated + */ + public RPHashObject reduce() { + + Iterator vecs = so.getVectorIterator(); + if (!vecs.hasNext()) + return so; + float[] vec = vecs.next(); + + HashAlgorithm hal = new NoHash(so.getHashmod()); + Decoder dec = so.getDecoderType(); + + Projector p = so.getProjectionType(); + p.setOrigDim(so.getdim()); + p.setProjectedDim(dec.getDimensionality()); + p.setRandomSeed(so.getRandomSeed()); + p.init(); + + List noise = LSH.genNoiseTable( + so.getdim(), + so.getNumBlur(), + new Random(so.getRandomSeed()), + (float) (dec.getErrorRadius()) + / (float) (dec.getDimensionality() * dec + .getDimensionality())); + + LSH lshfunc = new LSH(dec, p, hal, noise, so.getNormalize()); + List centroids = new ArrayList(); + + for (long id : so.getPreviousTopID()) { + centroids.add(new Centroid(so.getdim(), id, -1)); + } + + this.labels = new ArrayList<>(); + + if (so.getParallel()) { + try { + List data = so.getRawData(); + ForkJoinPool myPool = new ForkJoinPool(this.threads); + myPool.submit(() -> + + data.parallelStream().forEach(new Consumer() { + + @Override + public void accept(float[] t) { + redFunc(t, lshfunc, noise, labels, centroids); + } + + })).get(); + + } catch (InterruptedException | ExecutionException e) { + e.printStackTrace(); + } + + } else { + while (vecs.hasNext()) { + redFunc(vecs.next(), lshfunc, noise, labels, centroids); + } + } + + // while (vecs.hasNext()) + // { + // + // long[] hash = lshfunc.lshHashRadius(vec, noise); + // labels.add(-1l); + // //radius probe around the vector + // for (Centroid cent : centroids) { + // for (long h : hash) + // { + // if (cent.ids.contains(h)) { + // cent.updateVec(vec); + // this.labels.set(labels.size()-1,cent.id); + // } + // } + // } + // vec = vecs.next(); + // + // } + + + Clusterer offlineclusterer = so.getOfflineClusterer(); + offlineclusterer.setData(centroids); + offlineclusterer.setWeights(so.getCounts()); + offlineclusterer.setK(so.getk()); + + // System.out.println("\n k sent to offline = "+ 
so.getk()); + + this.centroids = offlineclusterer.getCentroids(); + + //System.out.println("\n cents in reduce from offline cluster = "+ this.centroids.size()); + + //System.out.println("\n cents in reduce after label mapping = "+ centroids.size()); + + this.labelmap = VectorUtil.generateIDMap(centroids, this.centroids); + + //so.setCentroids(centroids); + so.setCentroids(this.centroids); + + + + return so; + } + + // 271458 + // 264779.7 + + public List getLabels() { + for (int i = 0; i < labels.size(); i++) { + if (labelmap.containsKey(labels.get(i))) { + labels.set(i, labelmap.get(labels.get(i))); + } else { + labels.set(i, -1l); + } + } + return this.labels; + } + + private List centroids = null; + private RPHashObject so; + + public RPHashSimple_multiPosLsh(List data, int k) { + so = new SimpleArrayReader(data, k); + } + + int threads = 1; + + public RPHashSimple_multiPosLsh(List data, int k, int processors) { + // System.setProperty("java.util.concurrent.ForkJoinPool.common.parallelism",String.valueOf(processors)); + threads = processors; + so = new SimpleArrayReader(data, k); + so.setParallel(true); + } + + public RPHashSimple_multiPosLsh(List data, int k, int times, int rseed) { + so = new SimpleArrayReader(data, k); + } + + public RPHashSimple_multiPosLsh(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + if (centroids == null) + run(); + return centroids; + } + + @Override + public List getCentroids() { + if (centroids == null) + run(); + + return centroids; + } + + private void run() { + map(); + reduce(); + this.centroids = so.getCentroids(); + + + } + + public static void main(String[] args) { + int k = 10; + int d = 200; + int n = 1000; + float var = 1f; + int count = 5; + System.out.printf("Decoder: %s\n", "Sphere"); + System.out.printf("ClusterVar\t"); + for (int i = 0; i < count; i++) + System.out.printf("Trial%d\t", i); + System.out.printf("RealWCSS\n"); + + for (float f = var; f < 3.01; f += 
.05f) { + float avgrealwcss = 0; + float avgtime = 0; + System.out.printf("%f\t", f); + for (int i = 0; i < count; i++) { + GenerateData gen = new GenerateData(k, n / k, d, f, true, 1f); + RPHashObject o = new SimpleArrayReader(gen.data, k); + RPHashSimple_multiPosLsh rphit = new RPHashSimple_multiPosLsh(o); + o.setDecoderType(new SphericalRandom(32, 4, 1)); + //o.setDecoderType(new Spherical(32, 4, 1)); + // o.setDimparameter(31); + //o.setOfflineClusterer(new KMeans2()); + o.setOfflineClusterer(new MultiKMPP()); + + //System.out.println("\n k sent to offline in MAIN = "+ o.getk()); + + long startTime = System.nanoTime(); + List centsr = rphit.getCentroids(); + + //System.out.println("\n no of final cents : " + centsr.size()); + + + avgtime += (System.nanoTime() - startTime) / 100000000; + + avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(),gen.getData()); + + System.out.printf("%.0f\t", + StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.gc(); + + } + System.out.printf("%.0f\n", avgrealwcss / count); + } + } + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + // TODO Auto-generated method stub + + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + return true; + } +} diff --git a/src/main/java/edu/uc/rphash/RPHashStreamingAK.java b/src/main/java/edu/uc/rphash/RPHashStreamingAK.java deleted file mode 100644 index 873c20d..0000000 --- 
a/src/main/java/edu/uc/rphash/RPHashStreamingAK.java +++ /dev/null @@ -1,198 +0,0 @@ -package edu.uc.rphash; - -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.Random; - -import edu.uc.rphash.Readers.RPHashObject; -import edu.uc.rphash.Readers.SimpleArrayReader; -import edu.uc.rphash.decoders.Decoder; -import edu.uc.rphash.frequentItemSet.KHHCentroidCounterPush; -import edu.uc.rphash.knee.LpointKnee; -import edu.uc.rphash.lsh.LSH; -import edu.uc.rphash.projections.DBFriendlyProjection; -import edu.uc.rphash.projections.Projector; -import edu.uc.rphash.standardhash.HashAlgorithm; -import edu.uc.rphash.standardhash.MurmurHash; -import edu.uc.rphash.tests.StatTests; -import edu.uc.rphash.tests.clusterers.LloydIterativeKmeans; -import edu.uc.rphash.tests.generators.ClusterGenerator; - -/**This is an adaptation of RPHash Streaming with support for - * automatic knee finding and time based cluster decay. - * @author lee - * - */ -public class RPHashStreamingAK implements StreamClusterer { - - public KHHCentroidCounterPush is; - private LSH[] lshfuncs; - private StatTests vartracker; - private List centroids = null; - private RPHashObject so; - - - @Override - public synchronized long addVectorOnlineStep(float[] vec) { - - if(!lshfuncs[0].lshDecoder.selfScaling()){ - this.vartracker.updateVarianceSampleVec(vec); - vec = this.vartracker.scaleVector(vec); - } - - - Centroid c = new Centroid(vec,-1); - int ret = -1; - - for (LSH lshfunc : lshfuncs) { - if (so.getNumBlur() != 1) { - long[] hash = lshfunc - .lshHashRadius(vec, so.getNumBlur()); - for (long h : hash) { - c.addID(h); - is.addLong(h, 1); - } - } else { - long hash = lshfunc.lshHash(vec); - c.addID(hash); - is.addLong(hash, 1); - } - } - ret = is.addAndUpdate(c); - - return ret; - } - - public void init() { - Random r = new Random(so.getRandomSeed()); - this.vartracker = new StatTests(.01f); - int projections = so.getNumProjections(); - - // initialize our counter 
- float decayrate = so.getDecayRate();// 1f;// bottom number is window - // size - is = new KHHCentroidCounterPush(decayrate,new LpointKnee()); - // create LSH Device - lshfuncs = new LSH[projections]; - Decoder dec = so.getDecoderType(); - HashAlgorithm hal = new MurmurHash(so.getHashmod()); - // create projection matrices add to LSH Device - for (int i = 0; i < projections; i++) { - Projector p = new DBFriendlyProjection(so.getdim(), - dec.getDimensionality(), r.nextLong()); - List noise = LSH.genNoiseTable(dec.getDimensionality(), - so.getNumBlur(), r, dec.getErrorRadius() - / dec.getDimensionality()); - lshfuncs[i] = new LSH(dec, p, hal, noise,so.getNormalize()); - } - } - - public RPHashStreamingAK(ClusterGenerator c) { - so = new SimpleArrayReader(c,0); - init(); - } - - public RPHashStreamingAK(RPHashObject so) { - this.so = so; - init(); - } - - - - @Override - public List getCentroids() { - if (centroids == null) { - init(); - run(); - getCentroidsOfflineStep(); - } - return centroids; - } - - public List getCentroidsOfflineStep() { - - centroids = is.getTop(); - - -// centroids = new ArrayList(); -// List counts = is.getCounts(); -// -// for (int i = 0; i < cents.size(); i++) { -// centroids.add(cents.get(i).centroid()); -// } - - Clusterer offlineclusterer = so.getOfflineClusterer(); - offlineclusterer.setWeights(so.getCounts()); - offlineclusterer.setData(so.getCentroids()); - offlineclusterer.setK(so.getk()); - centroids = offlineclusterer.getCentroids(); - - return centroids; - } - - public void run() { - // add to frequent itemset the hashed Decoded randomly projected - // vector - Iterator vecs = so.getVectorIterator(); - while (vecs.hasNext()) { - addVectorOnlineStep(vecs.next()); - } - } - - public List getTopIdSizes() { - return is.getCounts(); - } - - @Override - public RPHashObject getParam() { - return this.so; - } - - @Override - public void setWeights(List counts) { - // TODO Auto-generated method stub - - } - - @Override - public void 
setRawData(List data) { -// this.data = data; - } - - @Override - public void setData(List centroids) { - ArrayList data = new ArrayList(centroids.size()); - for(Centroid c : centroids)data.add(c.centroid()); - setRawData(data); - } - - @Override - public void setK(int getk) { - // TODO Auto-generated method stub - - } - - @Override - public void shutdown() { - // TODO Auto-generated method stub - - } - - @Override - public void reset(int randomseed) { - centroids = null; - so.setRandomSeed(randomseed); - } - - @Override - public boolean setMultiRun(int runs) { - return false; - } - - @Override - public int getProcessors() { - return 1; - } - -} diff --git a/src/main/java/edu/uc/rphash/Readers/RPHashObject.java b/src/main/java/edu/uc/rphash/Readers/RPHashObject.java index 499ae9f..6e7f16d 100644 --- a/src/main/java/edu/uc/rphash/Readers/RPHashObject.java +++ b/src/main/java/edu/uc/rphash/Readers/RPHashObject.java @@ -8,27 +8,28 @@ import edu.uc.rphash.decoders.Decoder; import edu.uc.rphash.decoders.DepthProbingLSH; import edu.uc.rphash.decoders.E8; -import edu.uc.rphash.decoders.Golay; + import edu.uc.rphash.decoders.Leech; import edu.uc.rphash.decoders.MultiDecoder; -import edu.uc.rphash.decoders.PsdLSH; + import edu.uc.rphash.decoders.Spherical; import edu.uc.rphash.projections.DBFriendlyProjection; +import edu.uc.rphash.projections.GaussianProjection; import edu.uc.rphash.projections.Projector; -import edu.uc.rphash.tests.clusterers.Agglomerative; + import edu.uc.rphash.tests.clusterers.Agglomerative3; import edu.uc.rphash.tests.clusterers.Agglomerative3.ClusteringType; import edu.uc.rphash.tests.clusterers.KMeans2; import edu.uc.rphash.tests.clusterers.KMeans2NoWCSS; import edu.uc.rphash.tests.clusterers.KMeansPlusPlus; -import edu.uc.rphash.tests.clusterers.Kmeans; + import edu.uc.rphash.tests.clusterers.MultiKMPP; import edu.uc.rphash.tests.clusterers.DBScan; public interface RPHashObject { final static int DEFAULT_NUM_PROJECTIONS = 1; public final static int 
DEFAULT_NUM_BLUR = 1; - final static long DEFAULT_NUM_RANDOM_SEED = 38006359550206753L; + final static long DEFAULT_NUM_RANDOM_SEED = 3800635955020675334L; final static int DEFAULT_NUM_DECODER_MULTIPLIER = 1; final static long DEFAULT_HASH_MODULUS = Long.MAX_VALUE; final static Decoder DEFAULT_INNER_DECODER = new Spherical(32,4,1);//new DepthProbingLSH(24);//new Leech();//new Spherical(16,2,2);//new MultiDecoder(24, new E8(1f));//new Golay();//new Spherical(64,2,1);//new Leech(3);//new PsdLSH();// @@ -41,8 +42,11 @@ public interface RPHashObject { //final static Clusterer DEFAULT_OFFLINE_CLUSTERER = new MultiKMPP(); - final static Projector DEFAULT_PROJECTOR = new DBFriendlyProjection(); + final static Projector DEFAULT_PROJECTOR = new DBFriendlyProjection(); + //final static Projector DEFAULT_PROJECTOR = new GaussianProjection(); + + int getdim(); Iterator getVectorIterator(); @@ -88,6 +92,13 @@ public interface RPHashObject { void setDimparameter(int parseInt); int getDimparameter(); + void setCutoff(int parseInt); + int getCutoff(); + + void setRandomVector(boolean parseBoolean); + boolean getRandomVector(); + + // void setOfflineClusterer(Clusterer agglomerative3); // Clusterer getOfflineClusterer(); @@ -97,7 +108,7 @@ public interface RPHashObject { int getk(); - void setK(int getk); + void setK(int k); String toString(); void reset();//TODO rename to resetDataStream diff --git a/src/main/java/edu/uc/rphash/Readers/SimpleArrayReader.java b/src/main/java/edu/uc/rphash/Readers/SimpleArrayReader.java index eaf3c2f..322ab22 100644 --- a/src/main/java/edu/uc/rphash/Readers/SimpleArrayReader.java +++ b/src/main/java/edu/uc/rphash/Readers/SimpleArrayReader.java @@ -33,6 +33,11 @@ public class SimpleArrayReader implements RPHashObject { private Clusterer clusterer; private boolean normalize = false; private Projector projector; + + boolean RandomVector = false; + int Cutoff; + + public void setRandomSeed(long randomSeed) { this.randomSeed = randomSeed; @@ -94,6 +99,37 
@@ public SimpleArrayReader(List X, int k) { // topIDs.add((long) 0); } + public SimpleArrayReader(List X) { + + this.randomSeed = new Random().nextLong(); + this.hashmod = DEFAULT_HASH_MODULUS; + this.decoderMultiplier = DEFAULT_NUM_DECODER_MULTIPLIER; + if(this.decoderMultiplier>1) + this.dec = new MultiDecoder(this.decoderMultiplier*DEFAULT_INNER_DECODER.getDimensionality(),DEFAULT_INNER_DECODER); + else + this.dec = DEFAULT_INNER_DECODER; + this.numProjections = DEFAULT_NUM_PROJECTIONS; + this.numBlur = DEFAULT_NUM_BLUR; + this.data = X; + if(data!=null) + this.dim = data.get(0).length; + else + this.dim = null; + // this.k = k; + this.centroids = new ArrayList(); + this.topIDs = new ArrayList(); + this.decayrate = 0; + this.dimparameter = DEFAULT_DIM_PARAMETER; + this.clusterer = DEFAULT_OFFLINE_CLUSTERER; + this.projector = DEFAULT_PROJECTOR; +// for (int i = 0; i < k; i++) +// topIDs.add((long) 0); + } + + + + + // public SimpleArrayReader(List X, int k, int blur) { // // this.randomSeed = DEFAULT_NUM_RANDOM_SEED; @@ -397,4 +433,31 @@ public void setProjectionType(Projector dbFriendlyProjection) { public Projector getProjectionType(){ return this.projector; } + + + + @Override + public void setCutoff(int parseInt) { + this.Cutoff = parseInt; + + } + + @Override + public int getCutoff() { + + return this.Cutoff; + } + + + + @Override + public void setRandomVector(boolean parseBoolean) { + this.RandomVector = parseBoolean; + } + public boolean getRandomVector() { + return this.RandomVector; + } + + + } diff --git a/src/main/java/edu/uc/rphash/Readers/StreamObject.java b/src/main/java/edu/uc/rphash/Readers/StreamObject.java index 460070f..428a891 100644 --- a/src/main/java/edu/uc/rphash/Readers/StreamObject.java +++ b/src/main/java/edu/uc/rphash/Readers/StreamObject.java @@ -41,6 +41,9 @@ public class StreamObject implements RPHashObject, Iterator { Decoder dec; float decayrate=0; boolean parallel = true; + boolean RandomVector; + int Cutoff; + ExecutorService 
executor; InputStream inputStream; @@ -425,4 +428,25 @@ public void setProjectionType(Projector dbFriendlyProjection) { public Projector getProjectionType() { return this.projector; } + + + + @Override + public void setCutoff(int parseInt) { + this.Cutoff = parseInt; + } + @Override + public int getCutoff() { + return this.Cutoff; + } + + + @Override + public void setRandomVector(boolean parseBoolean) { + this.RandomVector = parseBoolean; + } + public boolean getRandomVector() { + return this.RandomVector; + } + } diff --git a/src/main/java/edu/uc/rphash/TWRP1.java b/src/main/java/edu/uc/rphash/TWRP1.java new file mode 100644 index 0000000..7a9f923 --- /dev/null +++ b/src/main/java/edu/uc/rphash/TWRP1.java @@ -0,0 +1,754 @@ +package edu.uc.rphash; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map.Entry; +import java.util.Random; +import java.util.TreeSet; +import java.util.stream.Stream; +import java.util.Map; + + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.clusterers.Agglomerative3; +import edu.uc.rphash.tests.generators.GenerateData; + + + +public class TWRP1 implements Clusterer, Runnable { + + boolean znorm = false; + + + private int counter; + private float[] rngvec; + private List centroids = null; + + + private RPHashObject so; + + public TWRP1(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + return getCentroids(); + } + + @Override + public List getCentroids() { + if (centroids == null) + run2(); + return centroids; + } + + + + + + /* + * X - set of vectors compute the medoid of a vector set + */ + float[] medoid(List X) { + float[] ret = X.get(0); + for (int i = 1; i < 
X.size(); i++) { + for (int j = 0; j < ret.length; j++) { + ret[j] += X.get(i)[j]; + } + } + for (int j = 0; j < ret.length; j++) { + ret[j] = ret[j] / ((float) X.size()); + } + return ret; + } + + //float[] rngvec; the range vector is moot if incoming data has been normalized + //post normalization it should all be zero centered, with variance 1 + + /* + * super simple hash algorithm, reminiscient of pstable lsh + */ + // xt is the projected vector and x is the original vector , rngvec is the randomly generated vector of projected dim. + + public long hashvec(float[] xt, float[] x, + HashMap> IDAndCent, HashMap> IDAndLabel,int ct) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { +// s <<= 1; + s = s << 1 ; // left shift the bits of s by 1. + if (xt[i] > rngvec[i]) +// s += 1; + s= s+1; + + if (IDAndCent.containsKey(s)) { + IDAndLabel.get(s).add(ct); + IDAndCent.get(s).add(x); + } else { + List xlist = new ArrayList<>(); + xlist.add(x); + IDAndCent.put(s, xlist); + List idlist = new ArrayList<>(); + idlist.add(ct); + IDAndLabel.put(s, idlist); + } + } + return s; + } + + + /* + * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid + * vector map + * + * hash the projected vector x and update the hash to centroid and counts + * maps + */ + void addtocounter(float[] x, Projector p, + HashMap> IDAndCent,HashMap> IDandID,int ct) { + float[] xt = p.project(x); + +// counter++; +// for(int i = 0;i> IDAndCent,HashMap> IDandID,int ct,float[] mean,float[] variance) + { + float[] xt = p.project(StatTests.znormvec(x, mean, variance)); + +// counter++; +// for(int i = 0;i> findDensityModes() { + HashMap> IDAndCent = new HashMap<>(); + HashMap> IDAndID = new HashMap<>(); + // #create projector matrixs + Projector projector = so.getProjectionType(); + projector.setOrigDim(so.getdim()); + projector.setProjectedDim(so.getDimparameter()); + projector.setRandomSeed(so.getRandomSeed()); + projector.init(); + + int ct = 0; +// if(znorm 
== true){ +// float[] variance = StatTests.varianceCol(so.getRawData()); +// float[] mean = StatTests.meanCols(so.getRawData()); +// // #process data by adding to the counter +// for (float[] x : so.getRawData()) +// { +// addtocounter(x, projector, IDAndCent,IDAndID,ct++,mean,variance); +// } +// } +// +// else + { + + for (float[] x : so.getRawData()) + { + addtocounter(x, projector, IDAndCent, IDAndID,ct++); + } + } + + + for (Long name: IDAndCent.keySet()){ + + String key =name.toString(); + System.out.println(key ); + + // String value = IDAndCent.get(name).toString() ; + // String value1 = Arrays.toString(value.toString()); + + // System.out.println(key + " " + value); + + +} + + for (Long name: IDAndID.keySet()){ + + // String key =name.toString(); + // String value = IDAndID.get(name).toString(); + // System.out.println(key + " " + value); + + +} + + // we would compress the hashmaps. SetOfIDandCount has the ids and the counts corresponding to that id. + // we have two hashmaps: 1. IDAndCent and 2. IDAndID. we will use IDAndCent + + + HashMap MapOfIDAndCount = new HashMap(); + + HashMap MapOfIDAndCent = new HashMap(); + + for (Long cur_id : new TreeSet(IDAndCent.keySet())) + { + int cur_count = IDAndCent.get(cur_id).size(); + + MapOfIDAndCount.put(cur_id, (long) cur_count); // this has the hashids and counts. + + List bucketpoints = new ArrayList<>(); + + Iterator e = IDAndCent.get(cur_id).iterator(); + + // int i=1; + while (e.hasNext()) { + + // System.out.println(i++); + + bucketpoints.add(e.next()) ; + + } + + float [] bucketcent; + + bucketcent = medoid(bucketpoints); + + MapOfIDAndCent.put(cur_id, bucketcent); // this has the hashids and centroids. 
+ + // System.out.println(cur_id + " " + cur_count); + + // int c = MapOfIDAndCent.get(cur_id).length; + + // System.out.println(cur_id + " " + c); + + + } + + + + // next we want to prune the tree by parent count comparison + // follows breadthfirst search + HashMap denseSetOfIDandCount = new HashMap(); + for (Long cur_id : new TreeSet(IDAndCent.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = IDAndCent.get(cur_id).size(); + long parent_id = cur_id>>>1; + int parent_count = IDAndCent.get(parent_id).size(); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount.put(parent_id, 0L); + IDAndCent.put(parent_id, new ArrayList<>()); + denseSetOfIDandCount.put(cur_id, (long) cur_count); + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount.remove(parent_id); + IDAndCent.put(parent_id, new ArrayList<>()); + denseSetOfIDandCount.put(cur_id, (long) cur_count); + } + } + } + } + } + + + + HashMap denseSetOfIDandCount2 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + //HashMap> IDAndCent = new HashMap<>(); and HashMap MapOfIDAndCent = new HashMap(); + + MapOfIDAndCent.put(parent_id, new float[]{}); + + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent.put(parent_id, new float[]{}); + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2.put(cur_id, (long) 
cur_count); + } + } + } + } + } + + + + + //remove keys with support less than 1 + Stream> stream = denseSetOfIDandCount.entrySet().stream().filter(p -> p.getValue() > 1); + + Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); + //64 so 6 bits? + //stream = stream.filter(p -> p.getKey() > 64); + + List sortedIDList= new ArrayList<>(); + // sort and limit the list + stream.sorted(Entry. comparingByValue().reversed()).limit(so.getk()*4) + .forEachOrdered(x -> sortedIDList.add(x.getKey())); + + List sortedIDList2= new ArrayList<>(); + // sort and limit the list + stream2.sorted(Entry. comparingByValue().reversed()).limit(so.getk()*4) + .forEachOrdered(x -> sortedIDList2.add(x.getKey())); + + + + // compute centroids + + HashMap> estcents = new HashMap<>(); + for (int i =0; i KeyAndCent = new HashMap<>(); + HashMap KeyAndCount = new HashMap<>(); + HashMap WeightAndCent = new HashMap<>(); + + for (int i =0; i(estcents.values()); + } + + + + + + public HashMap findDensityModes2() { + HashMap> IDAndCent = new HashMap<>(); + HashMap> IDAndID = new HashMap<>(); + // #create projector matrixs + Projector projector = so.getProjectionType(); + projector.setOrigDim(so.getdim()); + projector.setProjectedDim(so.getDimparameter()); + projector.setRandomSeed(so.getRandomSeed()); + projector.init(); + + int ct = 0; +// if(znorm == true){ +// float[] variance = StatTests.varianceCol(so.getRawData()); +// float[] mean = StatTests.meanCols(so.getRawData()); +// // #process data by adding to the counter +// for (float[] x : so.getRawData()) +// { +// addtocounter(x, projector, IDAndCent,IDAndID,ct++,mean,variance); +// } +// } +// +// else + { + + for (float[] x : so.getRawData()) + { + addtocounter(x, projector, IDAndCent, IDAndID,ct++); + } + } + + + for (Long name: IDAndCent.keySet()){ + + String key =name.toString(); + System.out.println(key ); + + // String value = IDAndCent.get(name).toString() ; +// String value1 = 
Arrays.toString(value.toString()); + +// System.out.println(key + " " + value); + + +} + + for (Long name: IDAndID.keySet()){ + +// String key =name.toString(); +// String value = IDAndID.get(name).toString(); +// System.out.println(key + " " + value); + + +} + + // we would compress the hashmaps. SetOfIDandCount has the ids and the counts corresponding to that id. + // we have two hashmaps: 1. IDAndCent and 2. IDAndID. we will use IDAndCent + + + HashMap MapOfIDAndCount = new HashMap(); + + HashMap MapOfIDAndCent = new HashMap(); + + for (Long cur_id : new TreeSet(IDAndCent.keySet())) + { + int cur_count = IDAndCent.get(cur_id).size(); + + MapOfIDAndCount.put(cur_id, (long) cur_count); // this has the hashids and counts. + + List bucketpoints = new ArrayList<>(); + + Iterator e = IDAndCent.get(cur_id).iterator(); + +// int i=1; + while (e.hasNext()) { + +// System.out.println(i++); + + bucketpoints.add(e.next()) ; + + } + + float [] bucketcent; + + bucketcent = medoid(bucketpoints); + + MapOfIDAndCent.put(cur_id, bucketcent); // this has the hashids and centroids. 
+ +// System.out.println(cur_id + " " + cur_count); + + // int c = MapOfIDAndCent.get(cur_id).length; + + // System.out.println(cur_id + " " + c); + + + } + + + + // next we want to prune the tree by parent count comparison + // follows breadthfirst search + HashMap denseSetOfIDandCount = new HashMap(); + for (Long cur_id : new TreeSet(IDAndCent.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = IDAndCent.get(cur_id).size(); + long parent_id = cur_id>>>1; + int parent_count = IDAndCent.get(parent_id).size(); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount.put(parent_id, 0L); + IDAndCent.put(parent_id, new ArrayList<>()); + denseSetOfIDandCount.put(cur_id, (long) cur_count); + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount.remove(parent_id); + IDAndCent.put(parent_id, new ArrayList<>()); + denseSetOfIDandCount.put(cur_id, (long) cur_count); + } + } + } + } + } + + + + HashMap denseSetOfIDandCount2 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + +//HashMap> IDAndCent = new HashMap<>(); and HashMap MapOfIDAndCent = new HashMap(); + + MapOfIDAndCent.put(parent_id, new float[]{}); + + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent.put(parent_id, new float[]{}); + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2.put(cur_id, (long) cur_count); 
+ } + } + } + } + } + + + + + //remove keys with support less than 1 + Stream> stream = denseSetOfIDandCount.entrySet().stream().filter(p -> p.getValue() > 1); + + Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); + //64 so 6 bits? + //stream = stream.filter(p -> p.getKey() > 64); + + List sortedIDList= new ArrayList<>(); + // sort and limit the list + stream.sorted(Entry. comparingByValue().reversed()).limit(so.getk()*4) + .forEachOrdered(x -> sortedIDList.add(x.getKey())); + + List sortedIDList2= new ArrayList<>(); + // sort and limit the list + stream2.sorted(Entry. comparingByValue().reversed()).limit(so.getk()*4) + .forEachOrdered(x -> sortedIDList2.add(x.getKey())); + + + + // compute centroids + + HashMap> estcents = new HashMap<>(); + for (int i =0; i KeyAndCent = new HashMap<>(); + HashMap KeyAndCount = new HashMap<>(); + HashMap WeightAndCent = new HashMap<>(); + + for (int i =0; i(estcents.values()); + + return WeightAndCent; + + +} + + + + + public void run() { + rngvec = new float[so.getDimparameter()]; + counter = 0; + Random r = new Random(so.getRandomSeed()); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) r.nextGaussian(); + + List> clustermembers = findDensityModes(); + Listcentroids = new ArrayList<>(); + + List weights =new ArrayList<>(); + int k = clustermembers.size()>200+so.getk()?200+so.getk():clustermembers.size(); + for(int i=0;i WeightAndClusters = findDensityModes2(); + + + Listcentroids2 = new ArrayList<>(); + List weights2 =new ArrayList<>(); + + + int NumberOfMicroClusters = WeightAndClusters.size() ; + + + int k = NumberOfMicroClusters>200+so.getk()?200+so.getk():NumberOfMicroClusters; + + + // have to prune depending NumberOfMicroClusters returned. 
+ + for (Long weights : new TreeSet(WeightAndClusters.keySet())) + + { + weights2.add((float)weights); + centroids2.add(WeightAndClusters.get(weights)); + } + + + + Agglomerative3 aggloOffline = new Agglomerative3(centroids2, so.getk()); + aggloOffline.setWeights(weights2); + + this.centroids = aggloOffline.getCentroids(); + + + } + + + + + + public static void main(String[] args) throws FileNotFoundException, + IOException { + + int k = 6;//6; + int d = 100;//16; + int n = 5000; + float var = 1.5f; + int count = 1; + // System.out.printf("ClusterVar\t"); + // for (int i = 0; i < count; i++) + // System.out.printf("Trial%d\t", i); + // System.out.printf("RealWCSS\n"); + + for (float f = var; f < 1.51; f += 1.5f) { + float avgrealwcss = 0; + float avgtime = 0; + // System.out.printf("%f\t", f); + for (int i = 0; i < count; i++) { + GenerateData gen = new GenerateData(k, n/k, d, f, true, .5f); + // gen.writeCSVToFile(new + // File("/home/lee/Desktop/reclsh/in.csv")); + RPHashObject o = new SimpleArrayReader(gen.data, k); + o.setDimparameter(8); + TWRP1 rphit = new TWRP1(o); + long startTime = System.nanoTime(); + List centsr = rphit.getCentroids(); + + avgtime += (System.nanoTime() - startTime) / 100000000; + + avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(), + gen.getData()); + + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.gc(); + } + System.out.printf("%.0f\n", avgrealwcss / count); + + } + } + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + 
this.so.setK(getk); + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + return false; + } +} diff --git a/src/main/java/edu/uc/rphash/TWRPv2.java b/src/main/java/edu/uc/rphash/TWRPv2.java new file mode 100644 index 0000000..7a025b0 --- /dev/null +++ b/src/main/java/edu/uc/rphash/TWRPv2.java @@ -0,0 +1,550 @@ +package edu.uc.rphash; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +//import java.util.Arrays; +import java.util.HashMap; +//import java.util.Iterator; +//import java.util.LinkedHashMap; +import java.util.List; +//import java.util.Map; +import java.util.Map.Entry; +import java.util.Random; +import java.util.TreeSet; +import java.util.stream.Stream; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.clusterers.Agglomerative3; +import edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.VectorUtil; + +//import org.apache.commons.collections.map.MultiValueMap; +//import org.apache.commons.collections.map.*; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Multimap; + + + +public class TWRPv2 implements Clusterer, Runnable { + + boolean znorm = false; + + private int counter; + private float[] rngvec; + private List centroids = null; + + private RPHashObject so; + + public TWRPv2(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + return getCentroids(); + } + + @Override + public List getCentroids() { + if (centroids == null) + run(); + return centroids; + } + + /* + * X - set of vectors compute the medoid of a vector set + */ + float[] medoid(List X) { + float[] ret = X.get(0); + for (int i = 1; i < X.size(); 
i++) { + for (int j = 0; j < ret.length; j++) { + ret[j] += X.get(i)[j]; + } + } + for (int j = 0; j < ret.length; j++) { + ret[j] = ret[j] / ((float) X.size()); + } + return ret; + } + +// this updates the map two cents with different weigths are merged into one. + public static float[][] UpdateHashMap(float cnt_1, float[] x_1, + float cnt_2, float[] x_2) { + + float cnt_r = cnt_1 + cnt_2; + + float[] x_r = new float[x_1.length]; + + for (int i = 0; i < x_1.length; i++) { + x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; + + } + + float[][] ret = new float[2][]; + ret[0] = new float[1]; + ret[0][0] = cnt_r; + ret[1] = x_r; + return ret; + } + + //float[] rngvec; the range vector is moot if incoming data has been normalized + //post normalization it should all be zero centered, with variance 1 + /* + * super simple hash algorithm, reminiscient of pstable lsh + */ + // xt is the projected vector and x is the original vector , rngvec is the randomly generated vector of projected dim. + +// public long hashvec(float[] xt, float[] x, +// HashMap> IDAndCent, HashMap> IDAndLabel,int ct) { +// long s = 1; //fixes leading 0's bug +// for (int i = 0; i < xt.length; i++) { +//// s <<= 1; +// s = s << 1 ; // left shift the bits of s by 1. +// if (xt[i] > rngvec[i]) +//// s += 1; +// s= s+1; +// +// if (IDAndCent.containsKey(s)) { +// IDAndLabel.get(s).add(ct); +// IDAndCent.get(s).add(x); +// } else { +// List xlist = new ArrayList<>(); +// xlist.add(x); +// IDAndCent.put(s, xlist); +// List idlist = new ArrayList<>(); +// idlist.add(ct); +// IDAndLabel.put(s, idlist); +// } +// } +// return s; +// } + + public long hashvec2(float[] xt, float[] x, + HashMap MapOfIDAndCent, HashMap MapOfIDAndCount,int ct) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { +// s <<= 1; + s = s << 1 ; // left shift the bits of s by 1. 
+ if (xt[i] > rngvec[i]) + s= s+1; + + if (MapOfIDAndCent.containsKey(s)) { + + float CurrentCount = MapOfIDAndCount.get(s); + float CurrentCent [] = MapOfIDAndCent.get(s); + float CountForIncomingVector = 1; + float IncomingVector [] = x; + + float[][] MergedValues = UpdateHashMap(CurrentCount , CurrentCent, CountForIncomingVector, IncomingVector ); + + Long UpdatedCount = (long) MergedValues[0][0] ; + + float[] MergedVector = MergedValues[1] ; + + MapOfIDAndCount.put(s , UpdatedCount); + + MapOfIDAndCent.put(s, MergedVector); + + } + + else { + + float[] xlist = x; + MapOfIDAndCent.put(s, xlist); + MapOfIDAndCount.put(s, (long)1); + } + } + return s; + } + + + /* + * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid + * vector map + * + * hash the projected vector x and update the hash to centroid and counts + * maps + */ + void addtocounter(float[] x, Projector p, + HashMap IDAndCent,HashMap IDandID,int ct) { + float[] xt = p.project(x); + +// counter++; +// for(int i = 0;i IDAndCent,HashMap IDandID,int ct,float[] mean,float[] variance) + { + float[] xt = p.project(StatTests.znormvec(x, mean, variance)); + +// counter++; +// for(int i = 0;i findDensityModes2() { + //public Map findDensityModes2() { + HashMap MapOfIDAndCent = new HashMap<>(); + HashMap MapOfIDAndCount = new HashMap<>(); + // #create projector matrixs + Projector projector = so.getProjectionType(); + projector.setOrigDim(so.getdim()); + projector.setProjectedDim(so.getDimparameter()); + projector.setRandomSeed(so.getRandomSeed()); + projector.init(); + int cutoff = so.getCutoff(); + + int ct = 0; +// if(znorm == true){ +// float[] variance = StatTests.varianceCol(so.getRawData()); +// float[] mean = StatTests.meanCols(so.getRawData()); +// // #process data by adding to the counter +// for (float[] x : so.getRawData()) +// { +// addtocounter(x, projector, MapOfIDAndCent,MapOfIDAndCount,ct++,mean,variance); +// } +// } +// +// else + { + + for (float[] x : so.getRawData()) + { + 
addtocounter(x, projector, MapOfIDAndCent, MapOfIDAndCount,ct++); + } + } + + +// for (Long name: MapOfIDAndCent.keySet()){ + +// String key =name.toString(); +// System.out.println(key); + + // String value = IDAndCent.get(name).toString() ; +// String value1 = Arrays.toString(value.toString()); +// System.out.println(key + " " + value); + +//} + +// for (Long name: MapOfIDAndCount.keySet()){ + +// String key =name.toString(); +// String value = IDAndID.get(name).toString(); +// System.out.println(key + " " + value); + +//} + + System.out.println("NumberOfMicroClustersBeforePruning = "+ MapOfIDAndCent.size()); + + // next we want to prune the tree by parent count comparison + // follows breadthfirst search + + HashMap denseSetOfIDandCount2 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + +//HashMap> IDAndCent = new HashMap<>(); and HashMap MapOfIDAndCent = new HashMap(); + + MapOfIDAndCent.put(parent_id, new float[]{}); + + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent.put(parent_id, new float[]{}); + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2.put(cur_id, (long) cur_count); + } + } + } + } + } + + + System.out.println("NumberOfMicroClustersAfterPruning&beforesortingLimit = "+ denseSetOfIDandCount2.size()); + + //remove keys with support less than 1 + Stream> stream2 = 
denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); + //64 so 6 bits? + //stream = stream.filter(p -> p.getKey() > 64); + +// Stream> stream3 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); +// long counter= stream3.count(); +// System.out.println("NumberOfMicroClustersAfterPruning&limit_the_1s = "+ counter); + +// int cutoff= so.getk()*8; +// if (so.getk()*6 < 210) { cutoff=210+so.getk();} else { cutoff = so.getk()*8;} +// int cutoff = clustermembers.size()>200+so.getk()?200+so.getk():clustermembers.size(); +// System.out.println("Cutoff = "+ cutoff); + + List sortedIDList2= new ArrayList<>(); + // sort and limit the list + stream2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2.add(x.getKey())); + +// HashMap KeyAndCent = new HashMap<>(); +// HashMap KeyAndCount = new HashMap<>(); +// Map WeightAndCent = new HashMap<>(); +// Map WeightAndCent = new LinkedHashMap<>(); + Multimap multimapWeightAndCent = ArrayListMultimap.create(); + + +// for (int i =0; i data = "/C:/Users/user/Desktop/temp/OutputTwrpCents1" + + RPHashObject o = new SimpleArrayReader(gen.data, k); + + o.setDimparameter(20); + + o.setCutoff(100); + o.setRandomVector(true); + +// System.out.println("cutoff = "+ o.getCutoff()); + System.out.println("get_random_Vector = "+ o.getRandomVector()); + + TWRPv2 rphit = new TWRPv2(o); + long startTime = System.nanoTime(); + List centsr = rphit.getCentroids(); + + avgtime += (System.nanoTime() - startTime) / 100000000; + + avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(), + gen.getData()); + + VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); + + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.gc(); + + System.out.printf("%.0f\n", avgrealwcss / count); + + + } + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO 
Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + this.so.setK(getk); + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + return false; + } + + //@Override + public void setCutoff(int getCutoff) { + this.so.setCutoff(getCutoff); + } + + //@Override + public void setRandomVector(boolean getRandomVector) { + this.so.setRandomVector(getRandomVector); + } + + +} diff --git a/src/main/java/edu/uc/rphash/TWRPv3.java b/src/main/java/edu/uc/rphash/TWRPv3.java new file mode 100644 index 0000000..b031fcd --- /dev/null +++ b/src/main/java/edu/uc/rphash/TWRPv3.java @@ -0,0 +1,606 @@ +package edu.uc.rphash; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +//import java.util.Arrays; +import java.util.HashMap; +//import java.util.Iterator; +//import java.util.LinkedHashMap; +import java.util.List; +//import java.util.Map; +import java.util.Map.Entry; +import java.util.Random; +import java.util.TreeSet; +import java.util.stream.Stream; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.clusterers.Agglomerative3; +import edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.VectorUtil; + +//import org.apache.commons.collections.map.MultiValueMap; +//import org.apache.commons.collections.map.*; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Multimap; 
+ + + +public class TWRPv3 implements Clusterer, Runnable { + + boolean znorm = false; + + private int counter; + private float[] rngvec; + private float[] rngvec2; + private float[] rngvec3; + + private List centroids = null; + + private RPHashObject so; + + public TWRPv3(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + return getCentroids(); + } + + @Override + public List getCentroids() { + if (centroids == null) + run(); + return centroids; + } + + + + // combines two hashmaps of idsandcents + + public static HashMap mergehmapsidsandcents( + HashMap partidandcent1, + HashMap partidandcent2, + HashMap partidandcount1, + HashMap partidandcount2) +{ + // new empty map + HashMap combined = new HashMap<>(); + combined.putAll( partidandcent1); + + for(Long key : partidandcent2.keySet()) { + if(combined.containsKey(key)) { + + + Long weight1= partidandcount1.get(key); + + float[] cent1= combined.get(key); + + Long weight2= partidandcount2.get(key); + + float [] cent2= partidandcent2.get(key); + + float [][] joined = UpdateHashMap(weight1, cent1 ,weight2 , cent2 ); + float combinedCount = joined[0][0]; + float [] combinedCent = joined[1]; + + + combined.put(key,combinedCent); + + } + else { + combined.put(key,partidandcent2.get(key)); + } + } + + return (combined); + +} + + + + // combines two hashmaps of idsandcounts + + public static HashMap mergehmapsidsandcounts(HashMap partidandcount1, + HashMap partidandcountt2) + { + + + HashMap combined = new HashMap (); // new empty map + combined.putAll(partidandcount1); + + + + for(Long key : partidandcountt2.keySet()) { + if(combined.containsKey(key)) { + + + long value1 = combined.get(key); + long value2 = partidandcountt2.get(key); + long value3 = value1 + value2; + + combined.put(key,value3); + + + } + else { + combined.put(key,partidandcountt2.get(key)); + } + + + + } + return (combined); + } + + + + /* + * X - set of vectors compute the medoid of a vector set + */ 
+ float[] medoid(List X) { + float[] ret = X.get(0); + for (int i = 1; i < X.size(); i++) { + for (int j = 0; j < ret.length; j++) { + ret[j] += X.get(i)[j]; + } + } + for (int j = 0; j < ret.length; j++) { + ret[j] = ret[j] / ((float) X.size()); + } + return ret; + } + +// this updates the map two cents with different weigths are merged into one. + public static float[][] UpdateHashMap(float cnt_1, float[] x_1, + float cnt_2, float[] x_2) { + + float cnt_r = cnt_1 + cnt_2; + + float[] x_r = new float[x_1.length]; + + for (int i = 0; i < x_1.length; i++) { + x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; + + } + + float[][] ret = new float[2][]; + ret[0] = new float[1]; + ret[0][0] = cnt_r; + ret[1] = x_r; + return ret; + } + + + public long hashvec2( float[] xt, float[] x, + HashMap MapOfIDAndCent, HashMap MapOfIDAndCount,int ct, float[] rngvec) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { +// s <<= 1; + s = s << 1 ; // left shift the bits of s by 1. 
+ if (xt[i] > rngvec[i]) + s= s+1; + + if (MapOfIDAndCent.containsKey(s)) { + + float CurrentCount = MapOfIDAndCount.get(s); + float CurrentCent [] = MapOfIDAndCent.get(s); + float CountForIncomingVector = 1; + float IncomingVector [] = x; + + float[][] MergedValues = UpdateHashMap(CurrentCount , CurrentCent, CountForIncomingVector, IncomingVector ); + + Long UpdatedCount = (long) MergedValues[0][0] ; + + float[] MergedVector = MergedValues[1] ; + + MapOfIDAndCount.put(s , UpdatedCount); + + MapOfIDAndCent.put(s, MergedVector); + + } + + else { + + float[] xlist = x; + MapOfIDAndCent.put(s, xlist); + MapOfIDAndCount.put(s, (long)1); + } + } + return s; + } + + + /* + * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid + * vector map + * + * hash the projected vector x and update the hash to centroid and counts + * maps + */ + void addtocounter(float[] x, Projector p, + HashMap IDAndCent,HashMap IDandID,int ct, float[] rngvec) { + float[] xt = p.project(x); + + hashvec2(xt,x,IDAndCent, IDandID,ct,rngvec ); + } + + void addtocounter(float[] x, Projector p, + HashMap IDAndCent,HashMap IDandID,int ct,float[] mean,float[] variance, float[] rngvec ) + { + float[] xt = p.project(StatTests.znormvec(x, mean, variance)); + + + hashvec2(xt,x,IDAndCent, IDandID,ct, rngvec); + } + + static boolean isPowerOfTwo(long num) { + return (num & -num) == num; + } + + + + + /* + * X - data set k - canonical k in k-means l - clustering sub-space Compute + * density mode via iterative deepening hash counting + */ + + + public Multimap findDensityModes2() { + //public Map findDensityModes2() { + HashMap MapOfIDAndCent1 = new HashMap<>(); + HashMap MapOfIDAndCount1 = new HashMap<>(); + + + HashMap MapOfIDAndCent2 = new HashMap<>(); + HashMap MapOfIDAndCount2 = new HashMap<>(); + + + HashMap MapOfIDAndCent3 = new HashMap<>(); + HashMap MapOfIDAndCount3 = new HashMap<>(); + + HashMap MapOfIDAndCent = new HashMap<>(); + HashMap MapOfIDAndCount = new HashMap<>(); + + + // 
#create projector matrixs + Projector projector = so.getProjectionType(); + projector.setOrigDim(so.getdim()); + projector.setProjectedDim(so.getDimparameter()); + projector.setRandomSeed(so.getRandomSeed()); + projector.init(); + int cutoff = so.getCutoff(); + + int ct = 0; + + { + + for (float[] x : so.getRawData()) + { + addtocounter(x, projector, MapOfIDAndCent1, MapOfIDAndCount1,ct++, rngvec); + addtocounter(x, projector, MapOfIDAndCent2, MapOfIDAndCount2,ct++, rngvec2); + addtocounter(x, projector, MapOfIDAndCent3, MapOfIDAndCount3,ct++, rngvec3); + + + } + } + + + MapOfIDAndCount=mergehmapsidsandcounts(MapOfIDAndCount1, MapOfIDAndCount2); + + MapOfIDAndCent = mergehmapsidsandcents(MapOfIDAndCent1,MapOfIDAndCent2,MapOfIDAndCount1, MapOfIDAndCount2 ); + + MapOfIDAndCent = mergehmapsidsandcents(MapOfIDAndCent,MapOfIDAndCent3,MapOfIDAndCount, MapOfIDAndCount3 ); + + MapOfIDAndCount=mergehmapsidsandcounts(MapOfIDAndCount, MapOfIDAndCount3); + + + + + + + System.out.println("NumberOfMicroClustersBeforePruning = "+ MapOfIDAndCent.size()); + + // next we want to prune the tree by parent count comparison + // follows breadthfirst search + + + + + HashMap denseSetOfIDandCount2 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + +//HashMap> IDAndCent = new HashMap<>(); and HashMap MapOfIDAndCent = new HashMap(); + + MapOfIDAndCent.put(parent_id, new float[]{}); + + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2.remove(parent_id); + + // 
IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent.put(parent_id, new float[]{}); + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2.put(cur_id, (long) cur_count); + } + } + } + } + } + + + System.out.println("NumberOfMicroClustersAfterPruning&beforesortingLimit = "+ denseSetOfIDandCount2.size()); + + //remove keys with support less than 1 + Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); + //64 so 6 bits? + //stream = stream.filter(p -> p.getKey() > 64); + +// Stream> stream3 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); +// long counter= stream3.count(); +// System.out.println("NumberOfMicroClustersAfterPruning&limit_the_1s = "+ counter); + +// int cutoff= so.getk()*8; +// if (so.getk()*6 < 210) { cutoff=210+so.getk();} else { cutoff = so.getk()*8;} +// int cutoff = clustermembers.size()>200+so.getk()?200+so.getk():clustermembers.size(); +// System.out.println("Cutoff = "+ cutoff); + + List sortedIDList2= new ArrayList<>(); + // sort and limit the list + stream2.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2.add(x.getKey())); + +// HashMap KeyAndCent = new HashMap<>(); +// HashMap KeyAndCount = new HashMap<>(); +// Map WeightAndCent = new HashMap<>(); +// Map WeightAndCent = new LinkedHashMap<>(); + Multimap multimapWeightAndCent = ArrayListMultimap.create(); + + + +// + for (Long keys: sortedIDList2) + + { +// WeightAndCent.put((Long)(MapOfIDAndCount.get(keys)), (float[]) (MapOfIDAndCent.get(keys))); + + multimapWeightAndCent.put((Long)(MapOfIDAndCount.get(keys)), (float[]) (MapOfIDAndCent.get(keys))); + +// KeyAndCent.put(keys, MapOfIDAndCent.get(keys)); +// KeyAndCount.put(keys, MapOfIDAndCount.get(keys)); + + } + + + + return multimapWeightAndCent; + +} + + + public void run() { + rngvec = new float[so.getDimparameter()]; + + rngvec2 = new float[so.getDimparameter()]; + + rngvec3 = new float[so.getDimparameter()]; + + counter = 0; + boolean randVect = so.getRandomVector(); + + // Random r = new Random(so.getRandomSeed()); + // Random r = new Random(3800635955020675334L) ; + Random r = new Random(); + Random r2 = new Random(); + Random r3 = new Random(); + + if (randVect==true){ + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) r.nextGaussian(); + + for (int i = 0; i < so.getDimparameter(); i++) + rngvec2[i] = (float) r2.nextGaussian(); + + for (int i = 0; i < so.getDimparameter(); i++) + rngvec3[i] = (float) r3.nextGaussian(); + + + } else { + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) 0; + } + + + + + + + Multimap WeightAndClusters = findDensityModes2(); + //Map WeightAndClusters = findDensityModes2(); + + + + + Listcentroids2 = new ArrayList<>(); + List weights2 =new ArrayList<>(); + + + System.out.println("NumberOfMicroClusters_AfterPruning = "+ WeightAndClusters.size()); + System.out.println("getRandomVector = "+ randVect); + + // int k = NumberOfMicroClusters>200+so.getk()?200+so.getk():NumberOfMicroClusters; + + // have to prune 
depending NumberOfMicroClusters returned. + // int i = 1; + // int j=1; + // for (Long weights : new TreeSet(WeightAndClusters.keySet())) + for (Long weights : WeightAndClusters.keys()) + { + // System.out.println("NumberOfTreesetkeys = "+ i); + // String key =weights.toString(); + // System.out.println(weights); + weights2.add((float)weights); + // centroids2.add(WeightAndClusters.get(weights)); + // centroids2.addAll(WeightAndClusters.get(weights)); + // i=i+1; + } + // System.out.println("done printing keys for weights"); + + for (Long weight : WeightAndClusters.keySet()) + + { + // System.out.println(weight); + // System.out.println("NumberOfTreesetkeys = "+ j); + centroids2.addAll(WeightAndClusters.get(weight)); + + // j=j+1; + } + // System.out.println("done printing keys for centroids"); + + // System.out.println(weights2.size()); + // System.out.println(centroids2.size()); + + //System.out.printf("\tvalueofK is "); + //System.out.println( so.getk()); + + Agglomerative3 aggloOffline = new Agglomerative3(centroids2, so.getk()); + + aggloOffline.setWeights(weights2); + + this.centroids = aggloOffline.getCentroids(); + + } + + + public static void main(String[] args) throws FileNotFoundException, + IOException { + + int k = 10;//6; + int d = 700;//16; + int n = 10000; + float var = 1.5f; + int count = 1; + // System.out.printf("ClusterVar\t"); + // for (int i = 0; i < count; i++) + // System.out.printf("Trial%d\t", i); + // System.out.printf("RealWCSS\n"); + + String Output = "/C:/Users/deysn/Desktop/temp/OutputTwrpCents1" ; + + float f = var; + float avgrealwcss = 0; + float avgtime = 0; + // System.out.printf("%f\t", f); + GenerateData gen = new GenerateData(k, n/k, d, f, true, .5f); + + // gen.writeCSVToFile(new File("/home/lee/Desktop/reclsh/in.csv")); + + // List data = "/C:/Users/user/Desktop/temp/OutputTwrpCents1" + + RPHashObject o = new SimpleArrayReader(gen.data, k); + + o.setDimparameter(20); + + o.setCutoff(100); + o.setRandomVector(true); + +// 
System.out.println("cutoff = "+ o.getCutoff()); +// System.out.println("get_random_Vector = "+ o.getRandomVector()); + + TWRPv3 rphit = new TWRPv3(o); + long startTime = System.nanoTime(); + List centsr = rphit.getCentroids(); + + avgtime += (System.nanoTime() - startTime) / 100000000; + + avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(), + gen.getData()); + + VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); + + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.gc(); + + System.out.printf("%.0f\n", avgrealwcss / count); + + + } + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + this.so.setK(getk); + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + return false; + } + + //@Override + public void setCutoff(int getCutoff) { + this.so.setCutoff(getCutoff); + } + + //@Override + public void setRandomVector(boolean getRandomVector) { + this.so.setRandomVector(getRandomVector); + } + + +} diff --git a/src/main/java/edu/uc/rphash/TWRPv4.java b/src/main/java/edu/uc/rphash/TWRPv4.java new file mode 100644 index 0000000..06bb22d --- /dev/null +++ b/src/main/java/edu/uc/rphash/TWRPv4.java @@ -0,0 +1,484 @@ +package edu.uc.rphash; + + +/* This class uses the bisection vector as the lsh partition vector */ +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +import 
java.util.HashMap; +import java.util.List; +import java.util.Map.Entry; +import java.util.Random; +import java.util.TreeSet; +import java.util.stream.Stream; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.clusterers.Agglomerative3; +import edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.VectorUtil; + +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Multimap; + + +public class TWRPv4 implements Clusterer, Runnable { + + boolean znorm = false; + + private int counter; + private List centroids = null; + private float[] bisectionVector; + + private RPHashObject so; + + public TWRPv4(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + return getCentroids(); + } + + @Override + public List getCentroids() { + if (centroids == null) + run(); + return centroids; + } + + /* + * X - set of vectors compute the medoid of a vector set + */ + float[] medoid(List X) { + float[] ret = X.get(0); + for (int i = 1; i < X.size(); i++) { + for (int j = 0; j < ret.length; j++) { + ret[j] += X.get(i)[j]; + } + } + for (int j = 0; j < ret.length; j++) { + ret[j] = ret[j] / ((float) X.size()); + } + return ret; + } + +// this updates the map two cents with different weigths are merged into one. 
+ public static float[][] UpdateHashMap(float cnt_1, float[] x_1, + float cnt_2, float[] x_2) { + + float cnt_r = cnt_1 + cnt_2; + + float[] x_r = new float[x_1.length]; + + float[] var_r1 = new float[x_1.length]; + float[] var_r2 = new float[x_1.length]; + + double var1=0; + double var2=0; + + + for (int i = 0; i < x_1.length; i++) { + x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; + + var_r1[i] = ((-x_r[i] + x_1[i]) * (-x_r[i] + x_1[i]))/1000000000; + + var_r2[i] =(((-x_r[i] + x_2[i]) * (-x_r[i] + x_2[i])))/1000000000; + + } + + for (int i = 0; i < var_r1.length; i++) { + var1 = var1 + var_r1[i]; + + var2 = var2 + var_r2[i]; + } + + + // System.out.println("wcsse = " + wcsse); + + + float[][] ret = new float[3][]; + ret[0] = new float[1]; + ret[0][0] = cnt_r; + ret[1] = x_r; + ret[2]= new float [1]; + return ret; + } + + + public static float[][] UpdateHashMap_actual(float cnt_1, float[] x_1, + float cnt_2, float[] x_2) { + + float cnt_r = cnt_1 + cnt_2; + + float[] x_r = new float[x_1.length]; + + for (int i = 0; i < x_1.length; i++) { + x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; + + } + + float[][] ret = new float[2][]; + ret[0] = new float[1]; + ret[0][0] = cnt_r; + ret[1] = x_r; + return ret; + } + + //float[] rngvec; the range vector is moot if incoming data has been normalized + //post normalization it should all be zero centered, with variance 1 + /* + * super simple hash algorithm, reminiscient of pstable lsh + */ + // xt is the projected vector and x is the original vector , rngvec is the randomly generated vector of projected dim. 
+ + + public long hashvec2(float[] xt, float[] x, + HashMap MapOfIDAndCent, HashMap MapOfIDAndCount,int ct, float[] bisectionVector) { + +// for (int i=0 ; i data = "/C:/Users/user/Desktop/temp/OutputTwrpCents1" + + RPHashObject o = new SimpleArrayReader(gen.data, k); + + o.setDimparameter(16); + + o.setCutoff(100); + o.setRandomVector(true); +// System.out.println("cutoff = "+ o.getCutoff()); + System.out.println("get_random_Vector = "+ o.getRandomVector()); + + TWRPv4 rphit = new TWRPv4(o); + long startTime = System.nanoTime(); + List centsr = rphit.getCentroids(); + + avgtime += (System.nanoTime() - startTime) / 100000000; + + avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(), + gen.getData()); + + VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); + + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.gc(); + + System.out.printf("%.0f\n", avgrealwcss / count); + + + } + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + this.so.setK(getk); + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + return false; + } + + //@Override + public void setCutoff(int getCutoff) { + this.so.setCutoff(getCutoff); + } + + //@Override + public void setRandomVector(boolean getRandomVector) { + this.so.setRandomVector(getRandomVector); + } + + +} diff --git a/src/main/java/edu/uc/rphash/TWRPv5_WCSS.java 
b/src/main/java/edu/uc/rphash/TWRPv5_WCSS.java new file mode 100644 index 0000000..a62b740 --- /dev/null +++ b/src/main/java/edu/uc/rphash/TWRPv5_WCSS.java @@ -0,0 +1,601 @@ +package edu.uc.rphash; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +//import java.util.Arrays; +import java.util.HashMap; +//import java.util.Iterator; +//import java.util.LinkedHashMap; +import java.util.List; +//import java.util.Map; +import java.util.Map.Entry; +import java.util.Random; +import java.util.TreeSet; +import java.util.stream.Stream; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.clusterers.Agglomerative3; +import edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.VectorUtil; + +//import org.apache.commons.collections.map.MultiValueMap; +//import org.apache.commons.collections.map.*; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Multimap; + + + +public class TWRPv5_WCSS implements Clusterer, Runnable { + + boolean znorm = false; + + private int counter; + private float[] rngvec; + private float[] rngvec2; + private float[] rngvec3; + + private List centroids = null; + + private RPHashObject so; + + public TWRPv5_WCSS(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + return getCentroids(); + } + + @Override + public List getCentroids() { + if (centroids == null) + run(); + return centroids; + } + + + + // This function returns the square of the euclidean distance. 
+ public static float distance(float[] x, float[] y) { + if (x.length < 1) + return Float.MAX_VALUE; + if (y.length < 1) + return Float.MAX_VALUE; + float dist = (x[0] - y[0]) * (x[0] - y[0]); + for (int i = 1; i < x.length; i++) + dist += ((x[i] - y[i]) * (x[i] - y[i])); +// return (float) Math.sqrt(dist); + return dist; + } + + + + + /* + * X - set of vectors compute the medoid of a vector set + */ + float[] medoid(List X) { + float[] ret = X.get(0); + for (int i = 1; i < X.size(); i++) { + for (int j = 0; j < ret.length; j++) { + ret[j] += X.get(i)[j]; + } + } + for (int j = 0; j < ret.length; j++) { + ret[j] = ret[j] / ((float) X.size()); + } + return ret; + } + +// this updates the map two cents with different weigths are merged into one. + public static float[][] UpdateHashMap(float cnt_1, float[] x_1, float wcss_1, + float cnt_2, float[] x_2 , float wcss_2) { + + float cnt_r = cnt_1 + cnt_2; + + float[] x_r = new float[x_1.length]; + + float[] var_r1 = new float[x_1.length]; + float[] var_r2 = new float[x_1.length]; + + double var1=0; + double var2=0; + + + for (int i = 0; i < x_1.length; i++) { + x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; + + var_r1[i] = ((-x_r[i] + x_1[i]) * (-x_r[i] + x_1[i]))/1000000000; + + var_r2[i] =(((-x_r[i] + x_2[i]) * (-x_r[i] + x_2[i])))/1000000000; + + + } + + for (int i = 0; i < var_r1.length; i++) { + var1 = var1 + var_r1[i]; + + var2 = var2 + var_r2[i]; + } + double wcsse=0; + wcsse = ( cnt_1*(wcss_1*wcss_1 + (var1)) + var2 / (cnt_1 + cnt_2 ) ) ; + + // System.out.println("wcsse = " + wcsse); + + float wcss = (float) wcsse; + + float[][] ret = new float[3][]; + ret[0] = new float[1]; + ret[0][0] = cnt_r; + ret[1] = x_r; + ret[2]= new float [1]; + ret[2][0]= wcss; + return ret; + + } + + + public static float[][] UpdateHashMap_actual(float cnt_1, float[] x_1, float wcss_1, + float cnt_2, float[] x_2 , float wcss_2) { + + float cnt_r = cnt_1 + cnt_2; + + float[] x_r = new float[x_1.length]; + + + for (int i = 0; i < 
x_1.length; i++) { + x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; + + } + + float wcss = distance(x_r,x_2) + wcss_1; + + +// float wcss = ( ( cnt_1*(wcss_1 + distance(x_r,x_1)) ) + distance(x_r,x_2) ) / (cnt_1); + +// float wcss = ( ((wcss_1 + distance(x_r,x_1)) ) + distance(x_r,x_2) ); + +// float wcss = ( ( ( cnt_1*(wcss_1 + distance(x_r,x_1)) ) + distance(x_r,x_2) ) / (cnt_r) ); + +// float dissq= distance(x_1,x_2); +// float wcss = wcss_1 + dissq - (dissq/cnt_r) ; + + + +// System.out.println("wcss = " + wcss); + + float[][] ret = new float[3][]; + ret[0] = new float[1]; + ret[0][0] = cnt_r; + ret[1] = x_r; + ret[2]= new float [1]; + ret[2][0]= wcss; + return ret; + + + } + + + public long hashvec2( float[] xt, float[] x, + HashMap MapOfIDAndCent, HashMap MapOfIDAndCount, int ct, float[] rngvec, HashMap MapOfIDAndWCSS) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { +// s <<= 1; + s = s << 1 ; // left shift the bits of s by 1. + if (xt[i] > rngvec[i]) + s= s+1; + + if (MapOfIDAndCent.containsKey(s)) { + + float CurrentCount = MapOfIDAndCount.get(s); + float CurrentCent [] = MapOfIDAndCent.get(s); + float CountForIncomingVector = 1; + float IncomingVector [] = x; + float currentWcss= MapOfIDAndWCSS.get(s); + float incomingWcss= 0; + + + float[][] MergedValues = UpdateHashMap_actual(CurrentCount , CurrentCent, currentWcss, CountForIncomingVector, IncomingVector, incomingWcss ); + + Long UpdatedCount = (long) MergedValues[0][0] ; + + float[] MergedVector = MergedValues[1] ; + + float wcss= MergedValues[2][0]; + + MapOfIDAndCount.put(s , UpdatedCount); + + MapOfIDAndCent.put(s, MergedVector); + + MapOfIDAndWCSS.put(s, wcss); + + + } + + else { + + float[] xlist = x; + MapOfIDAndCent.put(s, xlist); + MapOfIDAndCount.put(s, (long)1); + MapOfIDAndWCSS.put(s, (float)0); + } + } + return s; + } + + + /* + * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid + * vector map + * + * hash the projected vector x and 
update the hash to centroid and counts + * maps + */ + void addtocounter(float[] x, Projector p, + HashMap IDAndCent,HashMap IDandID,int ct, float[] rngvec , HashMap IDandWCSS) { + float[] xt = p.project(x); + + hashvec2(xt,x,IDAndCent, IDandID, ct,rngvec , IDandWCSS); + } + + + static boolean isPowerOfTwo(long num) { + return (num & -num) == num; + } + + + + /* + * X - data set k - canonical k in k-means l - clustering sub-space Compute + * density mode via iterative deepening hash counting + */ + + + public Multimap findDensityModes2() { + //public Map findDensityModes2() { + HashMap MapOfIDAndCent1 = new HashMap<>(); + HashMap MapOfIDAndCount1 = new HashMap<>(); + HashMap MapOfIDAndWCSS1 = new HashMap<>(); + + HashMap MapOfIDAndCent2 = new HashMap<>(); + HashMap MapOfIDAndCount2 = new HashMap<>(); + HashMap MapOfIDAndWCSS2 = new HashMap<>(); + + HashMap MapOfIDAndCent3 = new HashMap<>(); + HashMap MapOfIDAndCount3 = new HashMap<>(); + HashMap MapOfIDAndWCSS3 = new HashMap<>(); + + HashMap MapOfIDAndCent = new HashMap<>(); + HashMap MapOfIDAndCount = new HashMap<>(); + HashMap MapOfIDAndWCSS = new HashMap<>(); + + + + // #create projector matrixs + Projector projector = so.getProjectionType(); + projector.setOrigDim(so.getdim()); + projector.setProjectedDim(so.getDimparameter()); + projector.setRandomSeed(so.getRandomSeed()); + projector.init(); + int cutoff = so.getCutoff(); + + int ct = 0; + + { + + for (float[] x : so.getRawData()) + { + addtocounter(x, projector, MapOfIDAndCent1, MapOfIDAndCount1,ct++, rngvec, MapOfIDAndWCSS1); + addtocounter(x, projector, MapOfIDAndCent2, MapOfIDAndCount2,ct++, rngvec2,MapOfIDAndWCSS2); + addtocounter(x, projector, MapOfIDAndCent3, MapOfIDAndCount3,ct++, rngvec3,MapOfIDAndWCSS3); + + + } + } + + + float WCSS1 = 0; + float WCSS2 = 0; + float WCSS3 = 0; + + + for (Long cur_id : (MapOfIDAndWCSS1.keySet())) + + { // System.out.println("wcss1 = " + MapOfIDAndWCSS1.get(cur_id)); + WCSS1 = WCSS1 + MapOfIDAndWCSS1.get(cur_id);} + + 
for (Long cur_id : (MapOfIDAndWCSS2.keySet())) + + { WCSS2 = WCSS2 + MapOfIDAndWCSS2.get(cur_id);} + + for (Long cur_id : (MapOfIDAndWCSS3.keySet())) + + { WCSS3 = WCSS3 + MapOfIDAndWCSS3.get(cur_id);} + + System.out.println("wcss1 = " + WCSS1); + System.out.println("wcss2 = " + WCSS2); + System.out.println("wcss3 = " + WCSS3); + + if ((WCSS1 <= WCSS2) && (WCSS1 <= WCSS3)) + {MapOfIDAndCount = MapOfIDAndCount1; + MapOfIDAndCent = MapOfIDAndCent1; + MapOfIDAndWCSS = MapOfIDAndWCSS1; + System.out.println("winner = tree1"); + } + else if ((WCSS2 <= WCSS1) && (WCSS2 <= WCSS3)) + {MapOfIDAndCount = MapOfIDAndCount2; + MapOfIDAndCent = MapOfIDAndCent2; + MapOfIDAndWCSS = MapOfIDAndWCSS2; + System.out.println("winner = tree2"); + } + else + {MapOfIDAndCount = MapOfIDAndCount3; + MapOfIDAndCent = MapOfIDAndCent3; + MapOfIDAndWCSS = MapOfIDAndWCSS3; + System.out.println("winner = tree3"); + + } + + + System.out.println("NumberOfMicroClustersBeforePruning = "+ MapOfIDAndCent.size()); + + // next we want to prune the tree by parent count comparison + // follows breadthfirst search + + + + HashMap denseSetOfIDandCount2 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + MapOfIDAndCent.put(parent_id, new float[]{}); + + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent.put(parent_id, new float[]{}); + // MapOfIDAndCount.put(parent_id, new Long (0)); + + 
denseSetOfIDandCount2.put(cur_id, (long) cur_count); + } + } + } + } + } + + + System.out.println("NumberOfMicroClustersAfterPruning&beforesortingLimit = "+ denseSetOfIDandCount2.size()); + + //remove keys with support less than 1 + Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); + + + List sortedIDList2= new ArrayList<>(); + // sort and limit the list + stream2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2.add(x.getKey())); + + + Multimap multimapWeightAndCent = ArrayListMultimap.create(); + + + for (Long keys: sortedIDList2) + + { + + + multimapWeightAndCent.put((Long)(MapOfIDAndCount.get(keys)), (float[]) (MapOfIDAndCent.get(keys))); + + + } + + + return multimapWeightAndCent; + +} + + + public void run() { + rngvec = new float[so.getDimparameter()]; + + rngvec2 = new float[so.getDimparameter()]; + + rngvec3 = new float[so.getDimparameter()]; + + counter = 0; + boolean randVect = so.getRandomVector(); + + // Random r = new Random(so.getRandomSeed()); + // Random r = new Random(3800635955020675334L) ; + Random r = new Random(); + Random r2 = new Random(); + Random r3 = new Random(); + + if (randVect==true){ + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) r.nextGaussian(); + + for (int i = 0; i < so.getDimparameter(); i++) + rngvec2[i] = (float) r2.nextGaussian(); + + for (int i = 0; i < so.getDimparameter(); i++) + rngvec3[i] = (float) r3.nextGaussian(); + + + } else { + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) 0; + } + + + + + Multimap WeightAndClusters = findDensityModes2(); + + + Listcentroids2 = new ArrayList<>(); + List weights2 =new ArrayList<>(); + + + System.out.println("NumberOfMicroClusters_AfterPruning = "+ WeightAndClusters.size()); + System.out.println("getRandomVector = "+ randVect); + + + for (Long weights : WeightAndClusters.keys()) + { + + weights2.add((float)weights); + + } + + + for (Long weight : 
WeightAndClusters.keySet()) + + { + + centroids2.addAll(WeightAndClusters.get(weight)); + + } + + + Agglomerative3 aggloOffline = new Agglomerative3(centroids2, so.getk()); + + aggloOffline.setWeights(weights2); + + this.centroids = aggloOffline.getCentroids(); + + } + + + public static void main(String[] args) throws FileNotFoundException, + IOException { + + int k = 10;//6; + int d = 200;//16; + int n = 10000; + float var = 1.5f; + int count = 1; + // System.out.printf("ClusterVar\t"); + // for (int i = 0; i < count; i++) + // System.out.printf("Trial%d\t", i); + // System.out.printf("RealWCSS\n"); + + String Output = "/C:/Users/deysn/Desktop/temp/OutputTwrpCents1" ; + + float f = var; + float avgrealwcss = 0; + float avgtime = 0; + // System.out.printf("%f\t", f); + GenerateData gen = new GenerateData(k, n/k, d, f, true, .5f); + + // gen.writeCSVToFile(new File("/home/lee/Desktop/reclsh/in.csv")); + + // List data = "/C:/Users/user/Desktop/temp/OutputTwrpCents1" + + RPHashObject o = new SimpleArrayReader(gen.data, k); + + o.setDimparameter(16); + + o.setCutoff(100); + o.setRandomVector(true); + +// System.out.println("cutoff = "+ o.getCutoff()); +// System.out.println("get_random_Vector = "+ o.getRandomVector()); + + TWRPv5_WCSS rphit = new TWRPv5_WCSS(o); + long startTime = System.nanoTime(); + List centsr = rphit.getCentroids(); + + avgtime += (System.nanoTime() - startTime) / 100000000; + + avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(), + gen.getData()); + + VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); + + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.gc(); + + System.out.printf("%.0f\n", avgrealwcss / count); + + + } + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + 
public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + this.so.setK(getk); + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + return false; + } + + //@Override + public void setCutoff(int getCutoff) { + this.so.setCutoff(getCutoff); + } + + //@Override + public void setRandomVector(boolean getRandomVector) { + this.so.setRandomVector(getRandomVector); + } + + +} diff --git a/src/main/java/edu/uc/rphash/TWRPv6_COV.java b/src/main/java/edu/uc/rphash/TWRPv6_COV.java new file mode 100644 index 0000000..ecfce02 --- /dev/null +++ b/src/main/java/edu/uc/rphash/TWRPv6_COV.java @@ -0,0 +1,653 @@ +package edu.uc.rphash; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +//import java.util.Arrays; +import java.util.HashMap; +//import java.util.Iterator; +//import java.util.LinkedHashMap; +import java.util.List; +//import java.util.Map; +import java.util.Map.Entry; +import java.util.Random; +import java.util.TreeSet; +import java.util.stream.Stream; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.clusterers.Agglomerative3; +import edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.VectorUtil; + +//import org.apache.commons.collections.map.MultiValueMap; +//import org.apache.commons.collections.map.*; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Multimap; + + + +public class TWRPv6_COV implements Clusterer, Runnable { + + boolean znorm = false; + + private int counter; + private 
float[] rngvec; + private float[] rngvec2; + private float[] rngvec3; + + private List centroids = null; + + private RPHashObject so; + + public TWRPv6_COV(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + return getCentroids(); + } + + @Override + public List getCentroids() { + if (centroids == null) + run(); + return centroids; + } + + + +// This function returns the square of the euclidean distance. + public static float distancesq(float[] x, float[] y) { + if (x.length < 1) + return Float.MAX_VALUE; + if (y.length < 1) + return Float.MAX_VALUE; + float dist = (x[0] - y[0]) * (x[0] - y[0]); + for (int i = 1; i < x.length; i++) + dist += ((x[i] - y[i]) * (x[i] - y[i])); +// return (float) Math.sqrt(dist); + return dist; + } + + + /* + + /* + * X - set of vectors compute the medoid of a vector set + */ + float[] medoid(List X) { + float[] ret = X.get(0); + for (int i = 1; i < X.size(); i++) { + for (int j = 0; j < ret.length; j++) { + ret[j] += X.get(i)[j]; + } + } + for (int j = 0; j < ret.length; j++) { + ret[j] = ret[j] / ((float) X.size()); + } + return ret; + } + +// this updates the map two cents with different weights are merged into one. 
+ public static float[][] UpdateHashMap(float cnt_1, float[] x_1, float wcss_1, + float cnt_2, float[] x_2 , float wcss_2) { + + float cnt_r = cnt_1 + cnt_2; + + float[] x_r = new float[x_1.length]; + + + for (int i = 0; i < x_1.length; i++) { + x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; + + } + + + float wcss_cov = ( ( ( cnt_1*(wcss_1 + distancesq(x_r,x_1)) ) + distancesq(x_r,x_2) ) / (cnt_r) ); +// wcss_cov = wcss_cov/; +// System.out.println("wcsse = " + wcss); + + float[][] ret = new float[3][]; + ret[0] = new float[1]; + ret[0][0] = cnt_r; + ret[1] = x_r; + ret[2]= new float [1]; + ret[2][0]= wcss_cov; +// ret[3][0]= distance; + return ret; + + + } + + + public long hashvec2( float[] xt, float[] x, + HashMap MapOfIDAndCent, HashMap MapOfIDAndCount, int ct, float[] rngvec, HashMap MapOfIDAndWCSS) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { +// s <<= 1; + s = s << 1 ; // left shift the bits of s by 1. + if (xt[i] > rngvec[i]) + s= s+1; + + if (MapOfIDAndCent.containsKey(s)) { + + float CurrentCount = MapOfIDAndCount.get(s); + float CurrentCent [] = MapOfIDAndCent.get(s); + float CountForIncomingVector = 1; + float IncomingVector [] = x; + float currentWcss= MapOfIDAndWCSS.get(s); + float incomingWcss= 0; + + + float[][] MergedValues = UpdateHashMap(CurrentCount , CurrentCent, currentWcss, CountForIncomingVector, IncomingVector, incomingWcss ); + + Long UpdatedCount = (long) MergedValues[0][0] ; + + float[] MergedVector = MergedValues[1] ; + + float wcss= MergedValues[2][0]; + + MapOfIDAndCount.put(s , UpdatedCount); + + MapOfIDAndCent.put(s, MergedVector); + + MapOfIDAndWCSS.put(s, wcss); + + + } + + else { + + float[] xlist = x; + MapOfIDAndCent.put(s, xlist); + MapOfIDAndCount.put(s, (long)1); + MapOfIDAndWCSS.put(s, (float)0); + } + } + return s; + } + + + /* + * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid + * vector map + * + * hash the projected vector x and update the hash to centroid 
and counts + * maps + */ + void addtocounter(float[] x, Projector p, + HashMap IDAndCent,HashMap IDandID,int ct, float[] rngvec , HashMap IDandWCSS) { + float[] xt = p.project(x); + + hashvec2(xt,x,IDAndCent, IDandID, ct,rngvec , IDandWCSS); + } + + + static boolean isPowerOfTwo(long num) { + return (num & -num) == num; + } + + + + /* + * X - data set k - canonical k in k-means l - clustering sub-space Compute + * density mode via iterative deepening hash counting + */ + + + public Multimap findDensityModes2() { + //public Map findDensityModes2() { + HashMap MapOfIDAndCent1 = new HashMap<>(); + HashMap MapOfIDAndCount1 = new HashMap<>(); + HashMap MapOfIDAndWCSS1 = new HashMap<>(); + + HashMap MapOfIDAndCent2 = new HashMap<>(); + HashMap MapOfIDAndCount2 = new HashMap<>(); + HashMap MapOfIDAndWCSS2 = new HashMap<>(); + + HashMap MapOfIDAndCent3 = new HashMap<>(); + HashMap MapOfIDAndCount3 = new HashMap<>(); + HashMap MapOfIDAndWCSS3 = new HashMap<>(); + + + + // #create projector matrixs + Projector projector = so.getProjectionType(); + projector.setOrigDim(so.getdim()); + projector.setProjectedDim(so.getDimparameter()); + projector.setRandomSeed(so.getRandomSeed()); + projector.init(); + int cutoff = so.getCutoff(); + + int ct = 0; + + { + + for (float[] x : so.getRawData()) + { + addtocounter(x, projector, MapOfIDAndCent1, MapOfIDAndCount1,ct++, rngvec, MapOfIDAndWCSS1); + addtocounter(x, projector, MapOfIDAndCent2, MapOfIDAndCount2,ct++, rngvec2,MapOfIDAndWCSS2); + addtocounter(x, projector, MapOfIDAndCent3, MapOfIDAndCount3,ct++, rngvec3,MapOfIDAndWCSS3); + + + } + } + + + System.out.println("NumberOfMicroClustersBeforePruning = "+ MapOfIDAndCent1.size()); + + // next we want to prune the tree by parent count comparison + // follows breadthfirst search + + + + HashMap denseSetOfIDandCount2_1 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount1.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) 
(MapOfIDAndCount1.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount1.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_1.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + MapOfIDAndCent1.put(parent_id, new float[]{}); + + // MapOfIDAndCount1.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_1.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent1.put(parent_id, new float[]{}); + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + } + } + } + } + } + + + + + HashMap denseSetOfIDandCount2_2 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount2.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount2.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount2.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_2.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + MapOfIDAndCent2.put(parent_id, new float[]{}); + + // MapOfIDAndCount2.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_2.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent2.put(parent_id, new float[]{}); + // MapOfIDAndCount2.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + } + } + } + } + } + + + + HashMap denseSetOfIDandCount2_3 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount3.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) 
(MapOfIDAndCount3.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount3.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_3.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + MapOfIDAndCent3.put(parent_id, new float[]{}); + + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_3.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent3.put(parent_id, new float[]{}); + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + } + } + } + } + } + + + + //remove keys with support less than 1 + Stream> stream2_1 = denseSetOfIDandCount2_1.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2_1= new ArrayList<>(); + // sort and limit the list + stream2_1.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_1.add(x.getKey())); + + + + Stream> stream2_2 = denseSetOfIDandCount2_2.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2_2= new ArrayList<>(); + // sort and limit the list + stream2_2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_2.add(x.getKey())); + + + + Stream> stream2_3 = denseSetOfIDandCount2_3.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2_3= new ArrayList<>(); + // sort and limit the list + stream2_3.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_3.add(x.getKey())); + + + float WCSS1 = 0; + float WCSS2 = 0; + float WCSS3 = 0; + HashMap denseSetOfIDandCount2 = new HashMap(); + HashMap MapOfIDAndCent = new HashMap<>(); + HashMap MapOfIDAndCount = new HashMap<>(); + HashMap MapOfIDAndWCSS = new HashMap<>(); + + + for (Long keys: sortedIDList2_1) +// for (Long cur_id : (((HashMap) stream2_1).keySet())) + + { // System.out.println("wcss1 = " + MapOfIDAndWCSS1.get(cur_id)); + WCSS1 = WCSS1 + MapOfIDAndWCSS1.get(keys);} + +// for (Long cur_id : (denseSetOfIDandCount2_2.keySet())) + for (Long keys: sortedIDList2_2) + { WCSS2 = WCSS2 + MapOfIDAndWCSS2.get(keys);} + +// for (Long cur_id : (denseSetOfIDandCount2_3.keySet())) + for (Long keys: sortedIDList2_3) + { WCSS3 = WCSS3 + MapOfIDAndWCSS3.get(keys);} + + System.out.println("wcss1 = " + WCSS1); + System.out.println("wcss2 = " + WCSS2); + System.out.println("wcss3 = " + WCSS3); + + if ((WCSS1 <= WCSS2) && (WCSS1 <= WCSS3)) + {MapOfIDAndCount = MapOfIDAndCount1; + MapOfIDAndCent = MapOfIDAndCent1; + MapOfIDAndWCSS = MapOfIDAndWCSS1; + denseSetOfIDandCount2 = denseSetOfIDandCount2_1; + System.out.println("winner = tree1"); + } + else if ((WCSS2<= WCSS1) && (WCSS2<=WCSS3)) + {MapOfIDAndCount = MapOfIDAndCount2; + MapOfIDAndCent = MapOfIDAndCent2; + MapOfIDAndWCSS = MapOfIDAndWCSS2; + denseSetOfIDandCount2 = denseSetOfIDandCount2_2; + System.out.println("winner = tree2"); + } + else + {MapOfIDAndCount = MapOfIDAndCount3; + MapOfIDAndCent = MapOfIDAndCent3; + MapOfIDAndWCSS = MapOfIDAndWCSS3; + denseSetOfIDandCount2 = denseSetOfIDandCount2_3; + System.out.println("winner = tree3"); + + } + + System.out.println("NumberOfMicroClustersAfterPruning&beforesortingLimit = "+ denseSetOfIDandCount2.size()); + + //remove keys with support less than 1 + Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2= new ArrayList<>(); + // sort and 
limit the list + stream2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2.add(x.getKey())); + + + Multimap multimapWeightAndCent = ArrayListMultimap.create(); + + + for (Long keys: sortedIDList2) + + { + + + multimapWeightAndCent.put((Long)(MapOfIDAndCount.get(keys)), (float[]) (MapOfIDAndCent.get(keys))); + + + } + + + return multimapWeightAndCent; + +} + + + public void run() { + rngvec = new float[so.getDimparameter()]; + + rngvec2 = new float[so.getDimparameter()]; + + rngvec3 = new float[so.getDimparameter()]; + + counter = 0; + boolean randVect = so.getRandomVector(); + + // Random r = new Random(so.getRandomSeed()); + // Random r = new Random(3800635955020675334L) ; + Random r = new Random(); + Random r2 = new Random(); + Random r3 = new Random(); + + if (randVect==true){ + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) r.nextGaussian(); + + for (int i = 0; i < so.getDimparameter(); i++) + rngvec2[i] = (float) r2.nextGaussian(); + + for (int i = 0; i < so.getDimparameter(); i++) + rngvec3[i] = (float) r3.nextGaussian(); + + + } else { + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) 0; + } + + + + + Multimap WeightAndClusters = findDensityModes2(); + + + Listcentroids2 = new ArrayList<>(); + List weights2 =new ArrayList<>(); + + + System.out.println("NumberOfMicroClusters_AfterPruning = "+ WeightAndClusters.size()); + System.out.println("getRandomVector = "+ randVect); + + + for (Long weights : WeightAndClusters.keys()) + { + + weights2.add((float)weights); + + } + + + for (Long weight : WeightAndClusters.keySet()) + + { + + centroids2.addAll(WeightAndClusters.get(weight)); + + } + + + Agglomerative3 aggloOffline = new Agglomerative3(centroids2, so.getk()); + + aggloOffline.setWeights(weights2); + + this.centroids = aggloOffline.getCentroids(); + + } + + + public static void main(String[] args) throws FileNotFoundException, + IOException { + + int k = 10;//6; + int d = 
200;//16; + int n = 10000; + float var = 1.5f; + int count = 1; + // System.out.printf("ClusterVar\t"); + // for (int i = 0; i < count; i++) + // System.out.printf("Trial%d\t", i); + // System.out.printf("RealWCSS\n"); + + String Output = "/C:/Users/deysn/Desktop/temp/OutputTwrpCents1" ; + + float f = var; + float avgrealwcss = 0; + float avgtime = 0; + // System.out.printf("%f\t", f); + GenerateData gen = new GenerateData(k, n/k, d, f, true, .5f); + + // gen.writeCSVToFile(new File("/home/lee/Desktop/reclsh/in.csv")); + + // List data = "/C:/Users/user/Desktop/temp/OutputTwrpCents1" + + RPHashObject o = new SimpleArrayReader(gen.data, k); + + o.setDimparameter(16); + + o.setCutoff(100); + o.setRandomVector(true); + +// System.out.println("cutoff = "+ o.getCutoff()); +// System.out.println("get_random_Vector = "+ o.getRandomVector()); + + TWRPv6_WCSS2 rphit = new TWRPv6_WCSS2(o); + long startTime = System.nanoTime(); + List centsr = rphit.getCentroids(); + + avgtime += (System.nanoTime() - startTime) / 100000000; + + avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(), + gen.getData()); + + VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); + + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.gc(); + + System.out.printf("%.0f\n", avgrealwcss / count); + + + } + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + this.so.setK(getk); + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + 
@Override + public boolean setMultiRun(int runs) { + return false; + } + + //@Override + public void setCutoff(int getCutoff) { + this.so.setCutoff(getCutoff); + } + + //@Override + public void setRandomVector(boolean getRandomVector) { + this.so.setRandomVector(getRandomVector); + } + + +} diff --git a/src/main/java/edu/uc/rphash/TWRPv6_WCSS2.java b/src/main/java/edu/uc/rphash/TWRPv6_WCSS2.java new file mode 100644 index 0000000..53c2de4 --- /dev/null +++ b/src/main/java/edu/uc/rphash/TWRPv6_WCSS2.java @@ -0,0 +1,664 @@ +package edu.uc.rphash; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +//import java.util.Arrays; +import java.util.HashMap; +//import java.util.Iterator; +//import java.util.LinkedHashMap; +import java.util.List; +//import java.util.Map; +import java.util.Map.Entry; +import java.util.Random; +import java.util.TreeSet; +import java.util.stream.Stream; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.clusterers.Agglomerative3; +import edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.VectorUtil; + +//import org.apache.commons.collections.map.MultiValueMap; +//import org.apache.commons.collections.map.*; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Multimap; + + + +public class TWRPv6_WCSS2 implements Clusterer, Runnable { + + boolean znorm = false; + + private int counter; + private float[] rngvec; + private float[] rngvec2; + private float[] rngvec3; + + private List centroids = null; + + private RPHashObject so; + + public TWRPv6_WCSS2(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + return getCentroids(); + } + + @Override + public List getCentroids() { + if (centroids == null) + run(); + return 
centroids; + } + + + +// This function returns the square of the euclidean distance. + public static float distance(float[] x, float[] y) { + if (x.length < 1) + return Float.MAX_VALUE; + if (y.length < 1) + return Float.MAX_VALUE; + float dist = (x[0] - y[0]) * (x[0] - y[0]); + for (int i = 1; i < x.length; i++) + dist += ((x[i] - y[i]) * (x[i] - y[i])); +// return (float) Math.sqrt(dist); + return dist; + } + + + /* + + /* + * X - set of vectors compute the medoid of a vector set + */ + float[] medoid(List X) { + float[] ret = X.get(0); + for (int i = 1; i < X.size(); i++) { + for (int j = 0; j < ret.length; j++) { + ret[j] += X.get(i)[j]; + } + } + for (int j = 0; j < ret.length; j++) { + ret[j] = ret[j] / ((float) X.size()); + } + return ret; + } + +// this updates the map two cents with different weights are merged into one. + public static float[][] UpdateHashMap(float cnt_1, float[] x_1, float wcss_1, + float cnt_2, float[] x_2 , float wcss_2) { // incoming vector + + float cnt_r = cnt_1 + cnt_2; + + float[] x_r = new float[x_1.length]; + + for (int i = 0; i < x_1.length; i++) { + x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; + + } + + +// float wcss = ( ( cnt_1*(wcss_1 + distance(x_r,x_1)) ) + distance(x_r,x_2) ) / (cnt_1); + +// float wcss = ( ((wcss_1 + distance(x_r,x_1)) ) + distance(x_r,x_2) ); + +// float wcss = ( ( ( cnt_1*(wcss_1 + distance(x_r,x_1)) ) + distance(x_r,x_2) ) / (cnt_r) ); + + float dissq= distance(x_1,x_2); + float wcss = wcss_1 + dissq - (dissq/cnt_r) ; + + + + + + + +// System.out.println("wcsse = " + wcss); + + float[][] ret = new float[3][]; + ret[0] = new float[1]; + ret[0][0] = cnt_r; + ret[1] = x_r; + ret[2]= new float [1]; + ret[2][0]= wcss; + return ret; + + + } + + + public long hashvec2( float[] xt, float[] x, + HashMap MapOfIDAndCent, HashMap MapOfIDAndCount, int ct, float[] rngvec, HashMap MapOfIDAndWCSS) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { +// s <<= 1; + s = s << 1 ; // 
left shift the bits of s by 1. + if (xt[i] > rngvec[i]) + s= s+1; + + if (MapOfIDAndCent.containsKey(s)) { + + float CurrentCount = MapOfIDAndCount.get(s); + float CurrentCent [] = MapOfIDAndCent.get(s); + float CountForIncomingVector = 1; + float IncomingVector [] = x; + float currentWcss= MapOfIDAndWCSS.get(s); + float incomingWcss= 0; + + + float[][] MergedValues = UpdateHashMap(CurrentCount , CurrentCent, currentWcss, CountForIncomingVector, IncomingVector, incomingWcss ); + + Long UpdatedCount = (long) MergedValues[0][0] ; + + float[] MergedVector = MergedValues[1] ; + + float wcss= MergedValues[2][0]; + + MapOfIDAndCount.put(s , UpdatedCount); + + MapOfIDAndCent.put(s, MergedVector); + + MapOfIDAndWCSS.put(s, wcss); + + + } + + else { + + float[] xlist = x; + MapOfIDAndCent.put(s, xlist); + MapOfIDAndCount.put(s, (long)1); + MapOfIDAndWCSS.put(s, (float)0); + } + } + return s; + } + + + /* + * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid + * vector map + * + * hash the projected vector x and update the hash to centroid and counts + * maps + */ + void addtocounter(float[] x, Projector p, + HashMap IDAndCent,HashMap IDandID,int ct, float[] rngvec , HashMap IDandWCSS) { + float[] xt = p.project(x); + + hashvec2(xt,x,IDAndCent, IDandID, ct,rngvec , IDandWCSS); + } + + + static boolean isPowerOfTwo(long num) { + return (num & -num) == num; + } + + + + /* + * X - data set k - canonical k in k-means l - clustering sub-space Compute + * density mode via iterative deepening hash counting + */ + + + public Multimap findDensityModes2() { + //public Map findDensityModes2() { + HashMap MapOfIDAndCent1 = new HashMap<>(); + HashMap MapOfIDAndCount1 = new HashMap<>(); + HashMap MapOfIDAndWCSS1 = new HashMap<>(); + + HashMap MapOfIDAndCent2 = new HashMap<>(); + HashMap MapOfIDAndCount2 = new HashMap<>(); + HashMap MapOfIDAndWCSS2 = new HashMap<>(); + + HashMap MapOfIDAndCent3 = new HashMap<>(); + HashMap MapOfIDAndCount3 = new HashMap<>(); + HashMap 
MapOfIDAndWCSS3 = new HashMap<>(); + + + + // #create projector matrixs + Projector projector = so.getProjectionType(); + projector.setOrigDim(so.getdim()); + projector.setProjectedDim(so.getDimparameter()); + projector.setRandomSeed(so.getRandomSeed()); + projector.init(); + int cutoff = so.getCutoff(); + + int ct = 0; + + { + + for (float[] x : so.getRawData()) + { + addtocounter(x, projector, MapOfIDAndCent1, MapOfIDAndCount1,ct++, rngvec, MapOfIDAndWCSS1); + addtocounter(x, projector, MapOfIDAndCent2, MapOfIDAndCount2,ct++, rngvec2,MapOfIDAndWCSS2); + addtocounter(x, projector, MapOfIDAndCent3, MapOfIDAndCount3,ct++, rngvec3,MapOfIDAndWCSS3); + + + } + } + + + System.out.println("NumberOfMicroClustersBeforePruning = "+ MapOfIDAndCent1.size()); + + // next we want to prune the tree by parent count comparison + // follows breadthfirst search + + + + HashMap denseSetOfIDandCount2_1 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount1.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount1.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount1.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_1.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + MapOfIDAndCent1.put(parent_id, new float[]{}); + + // MapOfIDAndCount1.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_1.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent1.put(parent_id, new float[]{}); + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + } + } + } + } + } + + + + + HashMap denseSetOfIDandCount2_2 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount2.keySet())) + { + if (cur_id >so.getk()){ + 
int cur_count = (int) (MapOfIDAndCount2.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount2.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_2.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + MapOfIDAndCent2.put(parent_id, new float[]{}); + + // MapOfIDAndCount2.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_2.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent2.put(parent_id, new float[]{}); + // MapOfIDAndCount2.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + } + } + } + } + } + + + + HashMap denseSetOfIDandCount2_3 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount3.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount3.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount3.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_3.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + MapOfIDAndCent3.put(parent_id, new float[]{}); + + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_3.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent3.put(parent_id, new float[]{}); + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + } + } + } + } + } + + + + //remove keys with support less than 1 + Stream> stream2_1 = denseSetOfIDandCount2_1.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2_1= 
new ArrayList<>(); + // sort and limit the list + stream2_1.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_1.add(x.getKey())); + + + + Stream> stream2_2 = denseSetOfIDandCount2_2.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2_2= new ArrayList<>(); + // sort and limit the list + stream2_2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_2.add(x.getKey())); + + + + Stream> stream2_3 = denseSetOfIDandCount2_3.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2_3= new ArrayList<>(); + // sort and limit the list + stream2_3.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_3.add(x.getKey())); + + + float WCSS1 = 0; + float WCSS2 = 0; + float WCSS3 = 0; + HashMap denseSetOfIDandCount2 = new HashMap(); + HashMap MapOfIDAndCent = new HashMap<>(); + HashMap MapOfIDAndCount = new HashMap<>(); + HashMap MapOfIDAndWCSS = new HashMap<>(); + + + for (Long keys: sortedIDList2_1) +// for (Long cur_id : (((HashMap) stream2_1).keySet())) + + { // System.out.println("wcss1 = " + MapOfIDAndWCSS1.get(cur_id)); + WCSS1 = WCSS1 + MapOfIDAndWCSS1.get(keys);} + +// for (Long cur_id : (denseSetOfIDandCount2_2.keySet())) + for (Long keys: sortedIDList2_2) + { WCSS2 = WCSS2 + MapOfIDAndWCSS2.get(keys);} + +// for (Long cur_id : (denseSetOfIDandCount2_3.keySet())) + for (Long keys: sortedIDList2_3) + { WCSS3 = WCSS3 + MapOfIDAndWCSS3.get(keys);} + + System.out.println("wcss1 = " + WCSS1); + System.out.println("wcss2 = " + WCSS2); + System.out.println("wcss3 = " + WCSS3); + + if ((WCSS1 <= WCSS2) && (WCSS1 <= WCSS3)) + {MapOfIDAndCount = MapOfIDAndCount1; + MapOfIDAndCent = MapOfIDAndCent1; + MapOfIDAndWCSS = MapOfIDAndWCSS1; + denseSetOfIDandCount2 = denseSetOfIDandCount2_1; + System.out.println("winner = tree1"); + } + else if ((WCSS2<= WCSS1) && (WCSS2<=WCSS3)) + {MapOfIDAndCount = MapOfIDAndCount2; + 
MapOfIDAndCent = MapOfIDAndCent2; + MapOfIDAndWCSS = MapOfIDAndWCSS2; + denseSetOfIDandCount2 = denseSetOfIDandCount2_2; + System.out.println("winner = tree2"); + } + else + {MapOfIDAndCount = MapOfIDAndCount3; + MapOfIDAndCent = MapOfIDAndCent3; + MapOfIDAndWCSS = MapOfIDAndWCSS3; + denseSetOfIDandCount2 = denseSetOfIDandCount2_3; + System.out.println("winner = tree3"); + + } + + System.out.println("NumberOfMicroClustersAfterPruning&beforesortingLimit = "+ denseSetOfIDandCount2.size()); + + //remove keys with support less than 1 + Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2= new ArrayList<>(); + // sort and limit the list + stream2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2.add(x.getKey())); + + + Multimap multimapWeightAndCent = ArrayListMultimap.create(); + + + for (Long keys: sortedIDList2) + + { + + + multimapWeightAndCent.put((Long)(MapOfIDAndCount.get(keys)), (float[]) (MapOfIDAndCent.get(keys))); + + + } + + + return multimapWeightAndCent; + +} + + + public void run() { + rngvec = new float[so.getDimparameter()]; + + rngvec2 = new float[so.getDimparameter()]; + + rngvec3 = new float[so.getDimparameter()]; + + counter = 0; + boolean randVect = so.getRandomVector(); + + // Random r = new Random(so.getRandomSeed()); + // Random r = new Random(3800635955020675334L) ; + Random r = new Random(); + Random r2 = new Random(); + Random r3 = new Random(); + + if (randVect==true){ + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) r.nextGaussian(); + + for (int i = 0; i < so.getDimparameter(); i++) + rngvec2[i] = (float) r2.nextGaussian(); + + for (int i = 0; i < so.getDimparameter(); i++) + rngvec3[i] = (float) r3.nextGaussian(); + + + } else { + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) 0; + } + + + + + Multimap WeightAndClusters = findDensityModes2(); + + + Listcentroids2 = new ArrayList<>(); + List 
weights2 =new ArrayList<>(); + + + System.out.println("NumberOfMicroClusters_AfterPruning = "+ WeightAndClusters.size()); + System.out.println("getRandomVector = "+ randVect); + + + for (Long weights : WeightAndClusters.keys()) + { + + weights2.add((float)weights); + + } + + + for (Long weight : WeightAndClusters.keySet()) + + { + + centroids2.addAll(WeightAndClusters.get(weight)); + + } + + + Agglomerative3 aggloOffline = new Agglomerative3(centroids2, so.getk()); + + aggloOffline.setWeights(weights2); + + this.centroids = aggloOffline.getCentroids(); + + } + + + public static void main(String[] args) throws FileNotFoundException, + IOException { + + int k = 10;//6; + int d = 100;//16; + int n = 1000; + float var = 1f; + int count = 1; + // System.out.printf("ClusterVar\t"); + // for (int i = 0; i < count; i++) + // System.out.printf("Trial%d\t", i); + // System.out.printf("RealWCSS\n"); + + String Output = "/C:/Users/deysn/Desktop/temp/OutputTwrpCents1" ; + + float f = var; + float avgrealwcss = 0; + float avgtime = 0; + // System.out.printf("%f\t", f); + GenerateData gen = new GenerateData(k, n/k, d, f, true, 1f); + + // gen.writeCSVToFile(new File("/home/lee/Desktop/reclsh/in.csv")); + + // List data = "/C:/Users/user/Desktop/temp/OutputTwrpCents1" + + RPHashObject o = new SimpleArrayReader(gen.data, k); + + o.setDimparameter(16); + + o.setCutoff(100); + o.setRandomVector(true); + +// System.out.println("cutoff = "+ o.getCutoff()); +// System.out.println("get_random_Vector = "+ o.getRandomVector()); + + TWRPv6_WCSS2 rphit = new TWRPv6_WCSS2(o); + long startTime = System.nanoTime(); + List centsr = rphit.getCentroids(); + + avgtime += (System.nanoTime() - startTime) / 100000000; + + avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(), + gen.getData()); + +// VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); + + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.gc(); + +// System.out.printf("%.0f\n", 
avgrealwcss / count); + + + } + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + this.so.setK(getk); + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + return false; + } + + //@Override + public void setCutoff(int getCutoff) { + this.so.setCutoff(getCutoff); + } + + //@Override + public void setRandomVector(boolean getRandomVector) { + this.so.setRandomVector(getRandomVector); + } + + +} diff --git a/src/main/java/edu/uc/rphash/TWRPv6_meanVariance.java b/src/main/java/edu/uc/rphash/TWRPv6_meanVariance.java new file mode 100644 index 0000000..9196e44 --- /dev/null +++ b/src/main/java/edu/uc/rphash/TWRPv6_meanVariance.java @@ -0,0 +1,668 @@ +package edu.uc.rphash; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +//import java.util.Arrays; +import java.util.HashMap; +//import java.util.Iterator; +//import java.util.LinkedHashMap; +import java.util.List; +//import java.util.Map; +import java.util.Map.Entry; +import java.util.Random; +import java.util.TreeSet; +import java.util.stream.Stream; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.clusterers.Agglomerative3; +import edu.uc.rphash.tests.clusterers.Agglomerative3.ClusteringType; +import 
edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.VectorUtil; + +//import org.apache.commons.collections.map.MultiValueMap; +//import org.apache.commons.collections.map.*; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Multimap; + + + +public class TWRPv6_meanVariance implements Clusterer, Runnable { + + boolean znorm = false; + + private int counter; + private float[] rngvec; + private float[] rngvec2; + private float[] rngvec3; + + private List centroids = null; + + private RPHashObject so; + + public TWRPv6_meanVariance(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + return getCentroids(); + } + + @Override + public List getCentroids() { + if (centroids == null) + run(); + return centroids; + } + + + +// This function returns the square of the euclidean distance. + public static float distancesq(float[] x, float[] y) { + if (x.length < 1) + return Float.MAX_VALUE; + if (y.length < 1) + return Float.MAX_VALUE; + float dist = (x[0] - y[0]) * (x[0] - y[0]); + for (int i = 1; i < x.length; i++) + dist += ((x[i] - y[i]) * (x[i] - y[i])); +// return (float) Math.sqrt(dist); + return dist; + } + + + /* + + /* + * X - set of vectors compute the medoid of a vector set + */ + float[] medoid(List X) { + float[] ret = X.get(0); + for (int i = 1; i < X.size(); i++) { + for (int j = 0; j < ret.length; j++) { + ret[j] += X.get(i)[j]; + } + } + for (int j = 0; j < ret.length; j++) { + ret[j] = ret[j] / ((float) X.size()); + } + return ret; + } + +// this updates the map two cents with different weights are merged into one. 
+ public static float[][] UpdateHashMap(float cnt_1, float[] x_1, float wcss_1, + float cnt_2, float[] x_2 , float wcss_2) { + + float cnt_r = cnt_1 + cnt_2; + + float[] x_r = new float[x_1.length]; + + + for (int i = 0; i < x_1.length; i++) { + x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; + + } + +// float wcss = (distance(x_r,x_2)/cnt_r) + wcss_1; + +// float wcss = ( ( cnt_1*(wcss_1 + distance(x_r,x_1)) ) + distance(x_r,x_2) ) / (cnt_1); + +// float wcss = ( ((wcss_1 + distance(x_r,x_1)) ) + distance(x_r,x_2) ); + + float wcss = ( ( ( cnt_1*(wcss_1 + distancesq(x_r,x_1)) ) + distancesq(x_r,x_2) ) / (cnt_r) ); + + +// System.out.println("wcsse = " + wcss); + + float[][] ret = new float[3][]; + ret[0] = new float[1]; + ret[0][0] = cnt_r; + ret[1] = x_r; + ret[2]= new float [1]; + ret[2][0]= wcss; + return ret; + + + } + + + public long hashvec2( float[] xt, float[] x, + HashMap MapOfIDAndCent, HashMap MapOfIDAndCount, int ct, float[] rngvec, HashMap MapOfIDAndWCSS) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { +// s <<= 1; + s = s << 1 ; // left shift the bits of s by 1. 
+ if (xt[i] > rngvec[i]) + s= s+1; + + if (MapOfIDAndCent.containsKey(s)) { + + float CurrentCount = MapOfIDAndCount.get(s); + float CurrentCent [] = MapOfIDAndCent.get(s); + float CountForIncomingVector = 1; + float IncomingVector [] = x; + float currentWcss= MapOfIDAndWCSS.get(s); + float incomingWcss= 0; + + + float[][] MergedValues = UpdateHashMap(CurrentCount , CurrentCent, currentWcss, CountForIncomingVector, IncomingVector, incomingWcss ); + + Long UpdatedCount = (long) MergedValues[0][0] ; + + float[] MergedVector = MergedValues[1] ; + + float wcss= MergedValues[2][0]; + + MapOfIDAndCount.put(s , UpdatedCount); + + MapOfIDAndCent.put(s, MergedVector); + + MapOfIDAndWCSS.put(s, wcss); + + + } + + else { + + float[] xlist = x; + MapOfIDAndCent.put(s, xlist); + MapOfIDAndCount.put(s, (long)1); + MapOfIDAndWCSS.put(s, (float)0); + } + } + return s; + } + + + /* + * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid + * vector map + * + * hash the projected vector x and update the hash to centroid and counts + * maps + */ + void addtocounter(float[] x, Projector p, + HashMap IDAndCent,HashMap IDandID,int ct, float[] rngvec , HashMap IDandWCSS) { + float[] xt = p.project(x); + + hashvec2(xt,x,IDAndCent, IDandID, ct,rngvec , IDandWCSS); + } + + + static boolean isPowerOfTwo(long num) { + return (num & -num) == num; + } + + + + /* + * X - data set k - canonical k in k-means l - clustering sub-space Compute + * density mode via iterative deepening hash counting + */ + + + public Multimap findDensityModes2() { + //public Map findDensityModes2() { + HashMap MapOfIDAndCent1 = new HashMap<>(); + HashMap MapOfIDAndCount1 = new HashMap<>(); + HashMap MapOfIDAndWCSS1 = new HashMap<>(); + + HashMap MapOfIDAndCent2 = new HashMap<>(); + HashMap MapOfIDAndCount2 = new HashMap<>(); + HashMap MapOfIDAndWCSS2 = new HashMap<>(); + + HashMap MapOfIDAndCent3 = new HashMap<>(); + HashMap MapOfIDAndCount3 = new HashMap<>(); + HashMap MapOfIDAndWCSS3 = new 
HashMap<>(); + + + + // #create projector matrixs + Projector projector = so.getProjectionType(); + projector.setOrigDim(so.getdim()); + projector.setProjectedDim(so.getDimparameter()); + projector.setRandomSeed(so.getRandomSeed()); + projector.init(); + int cutoff = so.getCutoff(); + + int ct = 0; + + { + + for (float[] x : so.getRawData()) + { + addtocounter(x, projector, MapOfIDAndCent1, MapOfIDAndCount1,ct++, rngvec, MapOfIDAndWCSS1); + addtocounter(x, projector, MapOfIDAndCent2, MapOfIDAndCount2,ct++, rngvec2,MapOfIDAndWCSS2); + addtocounter(x, projector, MapOfIDAndCent3, MapOfIDAndCount3,ct++, rngvec3,MapOfIDAndWCSS3); + + + } + } + + + System.out.println("NumberOfMicroClustersBeforePruning = "+ MapOfIDAndCent1.size()); + + // next we want to prune the tree by parent count comparison + // follows breadthfirst search + + + + HashMap denseSetOfIDandCount2_1 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount1.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount1.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount1.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_1.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + MapOfIDAndCent1.put(parent_id, new float[]{}); + + // MapOfIDAndCount1.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_1.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent1.put(parent_id, new float[]{}); + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + } + } + } + } + } + + + + + HashMap denseSetOfIDandCount2_2 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount2.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) 
(MapOfIDAndCount2.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount2.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_2.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + MapOfIDAndCent2.put(parent_id, new float[]{}); + + // MapOfIDAndCount2.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_2.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent2.put(parent_id, new float[]{}); + // MapOfIDAndCount2.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + } + } + } + } + } + + + + HashMap denseSetOfIDandCount2_3 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount3.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount3.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount3.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_3.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + MapOfIDAndCent3.put(parent_id, new float[]{}); + + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_3.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent3.put(parent_id, new float[]{}); + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + } + } + } + } + } + + + + //remove keys with support less than 1 + Stream> stream2_1 = denseSetOfIDandCount2_1.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2_1= new ArrayList<>(); + // 
sort and limit the list + stream2_1.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_1.add(x.getKey())); + + + + Stream> stream2_2 = denseSetOfIDandCount2_2.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2_2= new ArrayList<>(); + // sort and limit the list + stream2_2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_2.add(x.getKey())); + + + + Stream> stream2_3 = denseSetOfIDandCount2_3.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2_3= new ArrayList<>(); + // sort and limit the list + stream2_3.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_3.add(x.getKey())); + + + float WCSS1 = 0; + float WCSS2 = 0; + float WCSS3 = 0; + HashMap denseSetOfIDandCount2 = new HashMap(); + HashMap MapOfIDAndCent = new HashMap<>(); + HashMap MapOfIDAndCount = new HashMap<>(); + HashMap MapOfIDAndWCSS = new HashMap<>(); + + + for (Long keys: sortedIDList2_1) +// for (Long cur_id : (((HashMap) stream2_1).keySet())) + + { // System.out.println("wcss1 = " + MapOfIDAndWCSS1.get(cur_id)); + WCSS1 = WCSS1 + MapOfIDAndWCSS1.get(keys);} + +// for (Long cur_id : (denseSetOfIDandCount2_2.keySet())) + for (Long keys: sortedIDList2_2) + { WCSS2 = WCSS2 + MapOfIDAndWCSS2.get(keys);} + +// for (Long cur_id : (denseSetOfIDandCount2_3.keySet())) + for (Long keys: sortedIDList2_3) + { WCSS3 = WCSS3 + MapOfIDAndWCSS3.get(keys);} + + System.out.println("wcss1 = " + WCSS1); + System.out.println("wcss2 = " + WCSS2); + System.out.println("wcss3 = " + WCSS3); + + if ((WCSS1 <= WCSS2) && (WCSS1 <= WCSS3)) + {MapOfIDAndCount = MapOfIDAndCount1; + MapOfIDAndCent = MapOfIDAndCent1; + MapOfIDAndWCSS = MapOfIDAndWCSS1; + denseSetOfIDandCount2 = denseSetOfIDandCount2_1; + System.out.println("winner = tree1"); + } + else if ((WCSS2<= WCSS1) && (WCSS2<=WCSS3)) + {MapOfIDAndCount = MapOfIDAndCount2; + MapOfIDAndCent = 
MapOfIDAndCent2; + MapOfIDAndWCSS = MapOfIDAndWCSS2; + denseSetOfIDandCount2 = denseSetOfIDandCount2_2; + System.out.println("winner = tree2"); + } + else + {MapOfIDAndCount = MapOfIDAndCount3; + MapOfIDAndCent = MapOfIDAndCent3; + MapOfIDAndWCSS = MapOfIDAndWCSS3; + denseSetOfIDandCount2 = denseSetOfIDandCount2_3; + System.out.println("winner = tree3"); + + } + + System.out.println("NumberOfMicroClustersAfterPruning&beforesortingLimit = "+ denseSetOfIDandCount2.size()); + + //remove keys with support less than 1 + Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2= new ArrayList<>(); + // sort and limit the list + stream2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2.add(x.getKey())); + + + Multimap multimapWeightAndCent = ArrayListMultimap.create(); + + + for (Long keys: sortedIDList2) + + { + + + multimapWeightAndCent.put((Long)(MapOfIDAndCount.get(keys)), (float[]) (MapOfIDAndCent.get(keys))); + + + } + + + return multimapWeightAndCent; + +} + + + public void run() { + rngvec = new float[so.getDimparameter()]; + + rngvec2 = new float[so.getDimparameter()]; + + rngvec3 = new float[so.getDimparameter()]; + + counter = 0; + boolean randVect = so.getRandomVector(); + + // Random r = new Random(so.getRandomSeed()); + // Random r = new Random(3800635955020675334L) ; + Random r = new Random(); + Random r2 = new Random(); + Random r3 = new Random(); + + if (randVect==true){ + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) r.nextGaussian(); + + for (int i = 0; i < so.getDimparameter(); i++) + rngvec2[i] = (float) r2.nextGaussian(); + + for (int i = 0; i < so.getDimparameter(); i++) + rngvec3[i] = (float) r3.nextGaussian(); + + + } else { + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) 0; + } + + + + + Multimap WeightAndClusters = findDensityModes2(); + + + Listcentroids2 = new ArrayList<>(); + List weights2 =new 
ArrayList<>(); + + + System.out.println("NumberOfMicroClusters_AfterPruning = "+ WeightAndClusters.size()); + System.out.println("getRandomVector = "+ randVect); + + + for (Long weights : WeightAndClusters.keys()) + { + + weights2.add((float)weights); + + } + + + for (Long weight : WeightAndClusters.keySet()) + + { + + centroids2.addAll(WeightAndClusters.get(weight)); + + } + + + Agglomerative3 aggloOffline = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids2, so.getk()); + + aggloOffline.setWeights(weights2); + + this.centroids = aggloOffline.getCentroids(); + + } + + + public static void main(String[] args) throws FileNotFoundException, + IOException { + + int k = 10;//6; + int d = 200;//16; + int n = 10000; + float var = 1.5f; + int count = 1; + // System.out.printf("ClusterVar\t"); + // for (int i = 0; i < count; i++) + // System.out.printf("Trial%d\t", i); + // System.out.printf("RealWCSS\n"); + + String Output = "/C:/Users/deysn/Desktop/temp/OutputTwrpCents1" ; + + float f = var; + float avgrealwcss = 0; + float avgtime = 0; + // System.out.printf("%f\t", f); + // GenerateData gen = new GenerateData(k, n/k, d, f, true, .5f); + + // gen.writeCSVToFile(new File("/home/lee/Desktop/reclsh/in.csv")); + // List data = "/C:/Users/user/Desktop/temp/OutputTwrpCents1" + + // RPHashObject o = new SimpleArrayReader(gen.data, k); + + boolean raw = Boolean.parseBoolean(("raw")); + List data = null; + data = VectorUtil.readFile("/C:/Users/deysn/Desktop/temp/1D.txt", raw); + k = 6; + RPHashObject o = new SimpleArrayReader(data, 6); + + + o.setDimparameter(16); + o.setCutoff(60); + o.setRandomVector(true); + +// System.out.println("cutoff = "+ o.getCutoff()); +// System.out.println("get_random_Vector = "+ o.getRandomVector()); + + TWRPv6_meanVariance rphit = new TWRPv6_meanVariance(o); + long startTime = System.nanoTime(); + List centsr = rphit.getCentroids(); + + avgtime += (System.nanoTime() - startTime) / 100000000; + +// avgrealwcss += 
StatTests.WCSSEFloatCentroid(gen.getMedoids(),gen.getData()); + + + + + VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); + +// System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, data)); + + System.gc(); + + System.out.printf("%.0f\n", avgrealwcss / count); + + + } + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + this.so.setK(getk); + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + return false; + } + + //@Override + public void setCutoff(int getCutoff) { + this.so.setCutoff(getCutoff); + } + + //@Override + public void setRandomVector(boolean getRandomVector) { + this.so.setRandomVector(getRandomVector); + } + + +} diff --git a/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline.java b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline.java new file mode 100644 index 0000000..e50785f --- /dev/null +++ b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline.java @@ -0,0 +1,775 @@ +package edu.uc.rphash; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +//import java.util.Arrays; +import java.util.HashMap; +//import java.util.Iterator; +//import java.util.LinkedHashMap; +import java.util.List; +//import java.util.Map; +import java.util.Map.Entry; +import java.util.Random; +import java.util.TreeSet; +import 
java.util.stream.Stream; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.clusterers.Agglomerative3; +import edu.uc.rphash.tests.clusterers.KMeans2; +import edu.uc.rphash.tests.clusterers.Agglomerative3.ClusteringType; +import edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.VectorUtil; + +//import org.apache.commons.collections.map.MultiValueMap; +//import org.apache.commons.collections.map.*; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Multimap; + + + +// this algorithm runs twrp 3 times : (only the random bisection vector varies, the Projection matrix remains same) +// and selects the one which hass the best wcss offline for the 10X candidate centroids. +public class TWRPv6_wcss_offline implements Clusterer, Runnable { + + boolean znorm = false; + + private int counter; + private float[] rngvec; + private float[] rngvec2; + private float[] rngvec3; + + private List centroids = null; + + private RPHashObject so; + + public TWRPv6_wcss_offline(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + return getCentroids(); + } + + @Override + public List getCentroids() { + if (centroids == null) + run(); + return centroids; + } + + +// This function returns the square of the euclidean distance. 
+ public static float distancesq(float[] x, float[] y) { + if (x.length < 1) + return Float.MAX_VALUE; + if (y.length < 1) + return Float.MAX_VALUE; + float dist = (x[0] - y[0]) * (x[0] - y[0]); + for (int i = 1; i < x.length; i++) + dist += ((x[i] - y[i]) * (x[i] - y[i])); +// return (float) Math.sqrt(dist); + return dist; + } + + /* + + /* + * X - set of vectors compute the medoid of a vector set + */ + float[] medoid(List X) { + float[] ret = X.get(0); + for (int i = 1; i < X.size(); i++) { + for (int j = 0; j < ret.length; j++) { + ret[j] += X.get(i)[j]; + } + } + for (int j = 0; j < ret.length; j++) { + ret[j] = ret[j] / ((float) X.size()); + } + return ret; + } + +// this updates the map two cents with different weights are merged into one. + public static float[][] UpdateHashMap(float cnt_1, float[] x_1, float wcss_1, + float cnt_2, float[] x_2 , float wcss_2) { + + float cnt_r = cnt_1 + cnt_2; + + float[] x_r = new float[x_1.length]; + + for (int i = 0; i < x_1.length; i++) { + x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; + + } +// float wcss = (distancesq(x_r,x_2)/cnt_r) + wcss_1; + +// float wcss = ( ( cnt_1*(wcss_1 + distancesq(x_r,x_1)) ) + distancesq(x_r,x_2) ) / (cnt_1); + + float wcss = ( ((wcss_1 + distancesq(x_r,x_1)) ) + distancesq(x_r,x_2) ); + +// float wcss = ( ( ( cnt_1*(wcss_1 + distancesq(x_r,x_1)) ) + distancesq(x_r,x_2) ) / (cnt_r) ); + +// System.out.println("wcsse = " + wcss); + + float[][] ret = new float[3][]; + ret[0] = new float[1]; + ret[0][0] = cnt_r; + ret[1] = x_r; + ret[2]= new float [1]; + ret[2][0]= wcss; + return ret; + + } + +// this method is used to calculate the offline wcss +// UpdateHashMap_offlineWcss( CurrentCent, currentWcss, IncomingVector, incomingWcss ); + + public static float[][] UpdateHashMap_offlineWcss(float[] x_1, float wcss_1,float[] x_2 ) { + + float wcss = wcss_1 + distancesq(x_1,x_2); + +// System.out.println("wcsse = " + wcss); + + float[][] ret = new float[3][]; + ret[0] = new float[1]; +// 
ret[0][0] = cnt_r; +// ret[1] = x_r; + ret[2]= new float [1]; + ret[2][0]= wcss; + return ret; + + } + + public long hashvec2( float[] xt, float[] x, + HashMap MapOfIDAndCent, HashMap MapOfIDAndCount, int ct, float[] rngvec, HashMap MapOfIDAndWCSS) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { +// s <<= 1; + s = s << 1 ; // left shift the bits of s by 1. + if (xt[i] > rngvec[i]) + s= s+1; + + if (MapOfIDAndCent.containsKey(s)) { + + float CurrentCount = MapOfIDAndCount.get(s); + float CurrentCent [] = MapOfIDAndCent.get(s); + float CountForIncomingVector = 1; + float IncomingVector [] = x; + float currentWcss= MapOfIDAndWCSS.get(s); + float incomingWcss= 0; + + float[][] MergedValues = UpdateHashMap(CurrentCount , CurrentCent, currentWcss, CountForIncomingVector, IncomingVector, incomingWcss ); + + Long UpdatedCount = (long) MergedValues[0][0] ; + + float[] MergedVector = MergedValues[1] ; + + float wcss= MergedValues[2][0]; + + MapOfIDAndCount.put(s , UpdatedCount); + + MapOfIDAndCent.put(s, MergedVector); + + MapOfIDAndWCSS.put(s, wcss); + + } + + else { + + float[] xlist = x; + MapOfIDAndCent.put(s, xlist); + MapOfIDAndCount.put(s, (long)1); + MapOfIDAndWCSS.put(s, (float)0); + } + } + return s; + } + +// this hash is to calculate the wcss +// hashvec2_forwcss(xt,x,IDAndCent,rngvec ,IDandWCSS); + + public long hashvec2_forwcss( float[] xt, float[] x, HashMap MapOfIDAndCent, float[] rngvec, HashMapIDandWCSS_offline) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { + s = s << 1 ; // left shift the bits of s by 1. 
+ if (xt[i] > rngvec[i]) + s= s+1; + + if (MapOfIDAndCent.containsKey(s)) { + + float CurrentCent [] = MapOfIDAndCent.get(s); + float IncomingVector [] = x; + + + float currentWcss= 0; + + if (IDandWCSS_offline.containsKey(s)) { + currentWcss= IDandWCSS_offline.get(s); + } + + float[][] MergedValues = UpdateHashMap_offlineWcss( CurrentCent, currentWcss, IncomingVector ); + + float wcss= MergedValues[2][0]; + + + IDandWCSS_offline.put(s, wcss); + + + } + } + return s; + } + + /* + * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid + * vector map + * + * hash the projected vector x and update the hash to centroid and counts + * maps + */ + void addtocounter(float[] x, Projector p, + HashMap IDAndCent,HashMap IDandID,int ct, float[] rngvec , HashMap IDandWCSS) { + float[] xt = p.project(x); + + hashvec2(xt,x,IDAndCent, IDandID, ct,rngvec , IDandWCSS); + } + + // this method is used to compute the offline WCSS to choose the best of the clusters + //calcWCSSoffline(x, projector, MapOfIDAndCent1, rngvec, MapOfIDAandWCSS1_offline); + + void calcWCSSoffline(float[] x, Projector p, HashMap MapOfIDAndCent, float[] rngvec , HashMap MapOfIDAandWCSS_offline) { + + float[] xt = p.project(x); + + hashvec2_forwcss(xt,x,MapOfIDAndCent,rngvec ,MapOfIDAandWCSS_offline); + + } + + static boolean isPowerOfTwo(long num) { + return (num & -num) == num; + } + + /* + * X - data set k - canonical k in k-means l - clustering sub-space Compute + * density mode via iterative deepening hash counting + */ + + public Multimap findDensityModes2() { + //public Map findDensityModes2() { + HashMap MapOfIDAndCent1 = new HashMap<>(); + HashMap MapOfIDAndCount1 = new HashMap<>(); + HashMap MapOfIDAndWCSS1 = new HashMap<>(); + + HashMap MapOfIDAndCent2 = new HashMap<>(); + HashMap MapOfIDAndCount2 = new HashMap<>(); + HashMap MapOfIDAndWCSS2 = new HashMap<>(); + + HashMap MapOfIDAndCent3 = new HashMap<>(); + HashMap MapOfIDAndCount3 = new HashMap<>(); + HashMap MapOfIDAndWCSS3 = new 
HashMap<>(); + + + // #create projector matrixs + Projector projector = so.getProjectionType(); + projector.setOrigDim(so.getdim()); + projector.setProjectedDim(so.getDimparameter()); + projector.setRandomSeed(so.getRandomSeed()); +// projector.setRandomSeed(535247432); + + projector.init(); + int cutoff = so.getCutoff(); + + int ct = 0; + + { + + for (float[] x : so.getRawData()) + { + addtocounter(x, projector, MapOfIDAndCent1, MapOfIDAndCount1,ct++, rngvec, MapOfIDAndWCSS1); + addtocounter(x, projector, MapOfIDAndCent2, MapOfIDAndCount2,ct++, rngvec2,MapOfIDAndWCSS2); + addtocounter(x, projector, MapOfIDAndCent3, MapOfIDAndCount3,ct++, rngvec3,MapOfIDAndWCSS3); + + } + } + + System.out.println("NumberOfMicroClustersBeforePruning = "+ MapOfIDAndCent1.size()); + + // next we want to prune the tree by parent count comparison + // follows breadthfirst search + + HashMap denseSetOfIDandCount2_1 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount1.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount1.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount1.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_1.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + MapOfIDAndCent1.put(parent_id, new float[]{}); + + // MapOfIDAndCount1.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_1.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent1.put(parent_id, new float[]{}); + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_2 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount2.keySet())) + { + if (cur_id 
>so.getk()){ + int cur_count = (int) (MapOfIDAndCount2.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount2.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_2.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + MapOfIDAndCent2.put(parent_id, new float[]{}); + + // MapOfIDAndCount2.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_2.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent2.put(parent_id, new float[]{}); + // MapOfIDAndCount2.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_3 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount3.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount3.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount3.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_3.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + MapOfIDAndCent3.put(parent_id, new float[]{}); + + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_3.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent3.put(parent_id, new float[]{}); + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + } + } + } + } + } + + + //remove keys with support less than 1 + Stream> stream2_1 = denseSetOfIDandCount2_1.entrySet().stream().filter(p -> p.getValue() > 1); + + List 
sortedIDList2_1= new ArrayList<>(); + // sort and limit the list + stream2_1.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_1.add(x.getKey())); + + + + Stream> stream2_2 = denseSetOfIDandCount2_2.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2_2= new ArrayList<>(); + // sort and limit the list + stream2_2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_2.add(x.getKey())); + + + + Stream> stream2_3 = denseSetOfIDandCount2_3.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2_3= new ArrayList<>(); + // sort and limit the list + stream2_3.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_3.add(x.getKey())); + + + float WCSS1 = 0; + float WCSS2 = 0; + float WCSS3 = 0; + + float WCSS_off_1 = 0; + float WCSS_off_2 = 0; + float WCSS_off_3 = 0; + + + HashMap denseSetOfIDandCount2 = new HashMap(); + + + HashMap MapOfIDAndCent = new HashMap<>(); + HashMap MapOfIDAndCount = new HashMap<>(); + HashMap MapOfIDAndWCSS = new HashMap<>(); + + HashMap MapOfIDAandWCSS_offline_1 = new HashMap<>(); + HashMap MapOfIDAandWCSS_offline_2 = new HashMap<>(); + HashMap MapOfIDAandWCSS_offline_3 = new HashMap<>(); + + // calculate the real wcss in offline fashion, so for the keys , hash the points into those buckets + // and calculate the wcss as we know their centroids : + + + for (float[] x : so.getRawData()) + { + + calcWCSSoffline(x, projector, MapOfIDAndCent1, rngvec, MapOfIDAandWCSS_offline_1); + calcWCSSoffline(x, projector, MapOfIDAndCent2, rngvec2, MapOfIDAandWCSS_offline_2); + calcWCSSoffline(x, projector, MapOfIDAndCent3, rngvec3, MapOfIDAandWCSS_offline_3); + + } + + + //* THIS IS THE RUNTIME CALCULATION OF WCSS STATISTICS WHICH IS DONE ONLINE: + + for (Long keys: sortedIDList2_1) +// for (Long cur_id : (((HashMap) stream2_1).keySet())) + { // System.out.println("wcss1 = " + 
MapOfIDAndWCSS1.get(cur_id)); + WCSS1 = WCSS1 + MapOfIDAndWCSS1.get(keys);} + +// for (Long cur_id : (denseSetOfIDandCount2_2.keySet())) + for (Long keys: sortedIDList2_2) + { WCSS2 = WCSS2 + MapOfIDAndWCSS2.get(keys);} + +// for (Long cur_id : (denseSetOfIDandCount2_3.keySet())) + for (Long keys: sortedIDList2_3) + { WCSS3 = WCSS3 + MapOfIDAndWCSS3.get(keys);} + +//* THIS IS THE RUNTIME CALCULATION OF WCSS STATISTICS WHICH REQUIRES ANOTHER PASS OVER THE DATA: + for (Long keys: sortedIDList2_1) +// for (Long cur_id : (((HashMap) stream2_1).keySet())) + { // System.out.println("wcss1 = " + MapOfIDAndWCSS1.get(cur_id)); + WCSS_off_1 = WCSS_off_1 + MapOfIDAandWCSS_offline_1.get(keys);} + +// for (Long cur_id : (denseSetOfIDandCount2_2.keySet())) + for (Long keys: sortedIDList2_2) + { WCSS_off_2 = WCSS_off_2 + MapOfIDAandWCSS_offline_2.get(keys);} + +// for (Long cur_id : (denseSetOfIDandCount2_3.keySet())) + for (Long keys: sortedIDList2_3) + { WCSS_off_3 = WCSS_off_3 + MapOfIDAandWCSS_offline_3.get(keys);} + + + + System.out.print("wcss1 = " + WCSS1); + System.out.println(" wcss_ofline_1 = " + WCSS_off_1); + + System.out.print("wcss2 = " + WCSS2); + System.out.println(" wcss_ofline_2 = " + WCSS_off_2); + + System.out.print("wcss3 = " + WCSS3); + System.out.println(" wcss_ofline_3 = " + WCSS_off_3); + + + if ((WCSS_off_1 <= WCSS_off_2) && (WCSS_off_1 <= WCSS_off_3)) + {MapOfIDAndCount = MapOfIDAndCount1; + MapOfIDAndCent = MapOfIDAndCent1; + MapOfIDAndWCSS = MapOfIDAndWCSS1; + denseSetOfIDandCount2 = denseSetOfIDandCount2_1; + System.out.println("winner = tree1"); + } + else if ((WCSS_off_2<= WCSS_off_1) && (WCSS_off_2<=WCSS_off_3)) + {MapOfIDAndCount = MapOfIDAndCount2; + MapOfIDAndCent = MapOfIDAndCent2; + MapOfIDAndWCSS = MapOfIDAndWCSS2; + denseSetOfIDandCount2 = denseSetOfIDandCount2_2; + System.out.println("winner = tree2"); + } + else + {MapOfIDAndCount = MapOfIDAndCount3; + MapOfIDAndCent = MapOfIDAndCent3; + MapOfIDAndWCSS = MapOfIDAndWCSS3; + 
denseSetOfIDandCount2 = denseSetOfIDandCount2_3; + System.out.println("winner = tree3"); + + } + + System.out.println("NumberOfMicroClustersAfterPruning&beforesortingLimit = "+ denseSetOfIDandCount2.size()); + + //remove keys with support less than 1 + Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2= new ArrayList<>(); + // sort and limit the list + stream2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2.add(x.getKey())); + + + Multimap multimapWeightAndCent = ArrayListMultimap.create(); + + + for (Long keys: sortedIDList2) + + { + + + multimapWeightAndCent.put((Long)(MapOfIDAndCount.get(keys)), (float[]) (MapOfIDAndCent.get(keys))); + + + } + + + return multimapWeightAndCent; + + + + + // this is to be taken out . only done for hypothesis testing. + + + + + +} + + + public void run() { + rngvec = new float[so.getDimparameter()]; + + rngvec2 = new float[so.getDimparameter()]; + + rngvec3 = new float[so.getDimparameter()]; + + counter = 0; + boolean randVect = so.getRandomVector(); + + // Random r = new Random(so.getRandomSeed()); + // Random r = new Random(3800635955020675334L) ; + Random r = new Random(); + Random r2 = new Random(); + Random r3 = new Random(); + + if (randVect==true){ + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) r.nextGaussian(); + + for (int i = 0; i < so.getDimparameter(); i++) + rngvec2[i] = (float) r2.nextGaussian(); + + for (int i = 0; i < so.getDimparameter(); i++) + rngvec3[i] = (float) r3.nextGaussian(); + + + } else { + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) 0; + } + + + Multimap WeightAndClusters = findDensityModes2(); + + + Listcentroids2 = new ArrayList<>(); + List weights2 =new ArrayList<>(); + + + System.out.println("NumberOfMicroClusters_AfterPruning = "+ WeightAndClusters.size()); + System.out.println("getRandomVector = "+ randVect); + + + for (Long weights : 
WeightAndClusters.keys()) + { + + weights2.add((float)weights); + + } + + + for (Long weight : WeightAndClusters.keySet()) + + { + + centroids2.addAll(WeightAndClusters.get(weight)); + + } + + + Agglomerative3 aggloOffline = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids2, so.getk()); + aggloOffline.setWeights(weights2); + this.centroids = aggloOffline.getCentroids(); + + +/* + KMeans2 aggloOffline2 = new KMeans2(); + aggloOffline2.setK(so.getk()); + aggloOffline2.setRawData(centroids2); + aggloOffline2.setWeights(weights2); + this.centroids = aggloOffline2.getCentroids(); +*/ + + } + + + public static void main(String[] args) throws FileNotFoundException, + IOException { + + int k = 10;//6; + int d = 200;//16; + int n = 10000; + float var = 1.5f; + int count = 1; + // System.out.printf("ClusterVar\t"); + // for (int i = 0; i < count; i++) + // System.out.printf("Trial%d\t", i); + // System.out.printf("RealWCSS\n"); + + String Output = "/C:/Users/deysn/Desktop/temp/OutputTwrpCents1" ; + + float f = var; + float avgrealwcss = 0; + float avgtime = 0; + // System.out.printf("%f\t", f); + // GenerateData gen = new GenerateData(k, n/k, d, f, true, .5f); + + // gen.writeCSVToFile(new File("/home/lee/Desktop/reclsh/in.csv")); + // List data = "/C:/Users/user/Desktop/temp/OutputTwrpCents1" + + // RPHashObject o = new SimpleArrayReader(gen.data, k); + + boolean raw = Boolean.parseBoolean(("raw")); + List data = null; + data = VectorUtil.readFile("/C:/Users/deysn/Desktop/temp/1D.txt", raw); + k = 6; + RPHashObject o = new SimpleArrayReader(data, 6); + + + o.setDimparameter(16); + o.setCutoff(60); + o.setRandomVector(true); + +// System.out.println("cutoff = "+ o.getCutoff()); +// System.out.println("get_random_Vector = "+ o.getRandomVector()); + + TWRPv6_wcss_offline rphit = new TWRPv6_wcss_offline(o); + long startTime = System.nanoTime(); + List centsr = rphit.getCentroids(); + + avgtime += (System.nanoTime() - startTime) / 100000000; + +// avgrealwcss += 
StatTests.WCSSEFloatCentroid(gen.getMedoids(),gen.getData()); + + + VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); + +// System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, data)); + + System.gc(); + + System.out.printf("%.0f\n", avgrealwcss / count); + + + } + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + this.so.setK(getk); + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + return false; + } + + //@Override + public void setCutoff(int getCutoff) { + this.so.setCutoff(getCutoff); + } + + //@Override + public void setRandomVector(boolean getRandomVector) { + this.so.setRandomVector(getRandomVector); + } + + +} diff --git a/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST.java b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST.java new file mode 100644 index 0000000..9138c3f --- /dev/null +++ b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST.java @@ -0,0 +1,1015 @@ +package edu.uc.rphash; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Comparator; +//import java.util.Arrays; +import java.util.HashMap; +//import java.util.Iterator; +//import java.util.LinkedHashMap; +import java.util.List; +//import java.util.Map; +import java.util.Map.Entry; +import java.util.Random; 
+import java.util.TreeSet; +import java.util.stream.Stream; +import java.util.Collections; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.clusterers.Agglomerative3; +import edu.uc.rphash.tests.clusterers.KMeans2; +import edu.uc.rphash.tests.clusterers.Agglomerative3.ClusteringType; +import edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.VectorUtil; +import edu.uc.rphash.tests.clusterers.DBScan; +import edu.uc.rphash.tests.clusterers.MultiKMPP; + + +//import org.apache.commons.collections.map.MultiValueMap; +//import org.apache.commons.collections.map.*; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Multimap; + + +// https://www.javatips.net/api/webofneeds-master/webofneeds/won-matcher-solr/src/main/java/won/matcher/solr/utils/Kneedle.java +// https://github.com/lukehb/137-stopmove/blob/master/src/main/java/onethreeseven/stopmove/algorithm/Kneedle.java + +// this algorithm runs twrp 3 times : (only the random bisection vector varies, the Projection matrix remains same) +// and selects the one which has the best wcss offline for the 10X candidate centroids. 
+public class TWRPv6_wcss_offline2_TEST implements Clusterer, Runnable { + + + List labels; // to directly output labels + HashMap labelmap; // to directly output labels + public List getLabels() { + for (int i = 0; i < labels.size(); i++) { + if (labelmap.containsKey(labels.get(i))) { + labels.set(i, labelmap.get(labels.get(i))); + } else { + labels.set(i, -1l); + } + } + return this.labels; + } + + + + boolean znorm = false; + private int counter; + private float[] rngvec; + private float[] rngvec2; + private float[] rngvec3; + private float eps; + + private List centroids = null; + + private RPHashObject so; + + public TWRPv6_wcss_offline2_TEST(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + return getCentroids(); + } + + @Override + public List getCentroids() { + if (centroids == null) + run(); + return centroids; + } + + +// This function returns the square of the euclidean distance. + public static float distancesq(float[] x, float[] y) { + if (x.length < 1) + return Float.MAX_VALUE; + if (y.length < 1) + return Float.MAX_VALUE; + float dist = (x[0] - y[0]) * (x[0] - y[0]); + for (int i = 1; i < x.length; i++) + dist += ((x[i] - y[i]) * (x[i] - y[i])); +// return (float) Math.sqrt(dist); + return dist; + } + + + /* + * X - set of vectors compute the medoid of a vector set + */ + float[] medoid(List X) { + float[] ret = X.get(0); + for (int i = 1; i < X.size(); i++) { + for (int j = 0; j < ret.length; j++) { + ret[j] += X.get(i)[j]; + } + } + for (int j = 0; j < ret.length; j++) { + ret[j] = ret[j] / ((float) X.size()); + } + return ret; + } + +// this updates the map two cents with different weights are merged into one. 
+ public static float[][] UpdateHashMap(float cnt_1, float[] x_1, float wcss_1, + float cnt_2, float[] x_2 , float wcss_2) { + + float cnt_r = cnt_1 + cnt_2; + + float[] x_r = new float[x_1.length]; + + for (int i = 0; i < x_1.length; i++) { + x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; + + } +// float wcss = (distancesq(x_r,x_2)/cnt_r) + wcss_1; + +// float wcss = ( ( cnt_1*(wcss_1 + distancesq(x_r,x_1)) ) + distancesq(x_r,x_2) ) / (cnt_1); + + float wcss = ( ((wcss_1 + distancesq(x_r,x_1)) ) + distancesq(x_r,x_2) ); + +// float wcss = ( ( ( cnt_1*(wcss_1 + distancesq(x_r,x_1)) ) + distancesq(x_r,x_2) ) / (cnt_r) ); + +// System.out.println("wcsse = " + wcss); + + float[][] ret = new float[3][]; + ret[0] = new float[1]; + ret[0][0] = cnt_r; + ret[1] = x_r; + ret[2]= new float [1]; + ret[2][0]= wcss; + return ret; + + } + +// this method is used to calculate the offline wcss +// UpdateHashMap_offlineWcss( CurrentCent, currentWcss, IncomingVector, incomingWcss ); + +/* public static float[][] UpdateHashMap_offlineWcss(float[] x_1, float wcss_1,float[] x_2 ) { + + float wcss = wcss_1 + distancesq(x_1,x_2); + +// System.out.println("wcsse = " + wcss); + + float[][] ret = new float[3][]; + ret[0] = new float[1]; +// ret[0][0] = cnt_r; +// ret[1] = x_r; + ret[2]= new float [1]; + ret[2][0]= wcss; + return ret; + + } +*/ + public long hashvec2( float[] xt, float[] x, + HashMap MapOfIDAndCent, HashMap MapOfIDAndCount, int ct, float[] rngvec, HashMap MapOfIDAndWCSS) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { +// s <<= 1; + s = s << 1 ; // left shift the bits of s by 1. 
+ if (xt[i] > rngvec[i]) + s= s+1; + + if (MapOfIDAndCent.containsKey(s)) { + + float CurrentCount = MapOfIDAndCount.get(s); + float CurrentCent [] = MapOfIDAndCent.get(s); + float CountForIncomingVector = 1; + float IncomingVector [] = x; + float currentWcss= MapOfIDAndWCSS.get(s); + float incomingWcss= 0; + + float[][] MergedValues = UpdateHashMap(CurrentCount , CurrentCent, currentWcss, CountForIncomingVector, IncomingVector, incomingWcss ); + + Long UpdatedCount = (long) MergedValues[0][0] ; + + float[] MergedVector = MergedValues[1] ; + + float wcss= MergedValues[2][0]; + + MapOfIDAndCount.put(s , UpdatedCount); + + MapOfIDAndCent.put(s, MergedVector); + + MapOfIDAndWCSS.put(s, wcss); + + } + + else { + + float[] xlist = x; + MapOfIDAndCent.put(s, xlist); + MapOfIDAndCount.put(s, (long)1); + MapOfIDAndWCSS.put(s, (float)0); + } + } + return s; + } + +// this hash is to calculate the wcss +// hashvec2_forwcss(xt,x,IDAndCent,rngvec ,IDandWCSS); + +/* public long hashvec2_forwcss( float[] xt, float[] x, HashMap MapOfIDAndCent, float[] rngvec, HashMapIDandWCSS_offline) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { + s = s << 1 ; // left shift the bits of s by 1. 
+ if (xt[i] > rngvec[i]) + s= s+1; + + if (MapOfIDAndCent.containsKey(s)) { + + float CurrentCent [] = MapOfIDAndCent.get(s); + float IncomingVector [] = x; + + + float currentWcss= 0; + + if (IDandWCSS_offline.containsKey(s)) { + currentWcss= IDandWCSS_offline.get(s); + } + + float[][] MergedValues = UpdateHashMap_offlineWcss( CurrentCent, currentWcss, IncomingVector ); + + float wcss= MergedValues[2][0]; + + + IDandWCSS_offline.put(s, wcss); + + + } + } + return s; + } +*/ + + /* + * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid + * vector map + * + * hash the projected vector x and update the hash to centroid and counts + * maps + */ + void addtocounter(float[] x, Projector p, + HashMap IDAndCent,HashMap IDandID,int ct, float[] rngvec , HashMap IDandWCSS) { + float[] xt = p.project(x); + + hashvec2(xt,x,IDAndCent, IDandID, ct,rngvec , IDandWCSS); + } + + // this method is used to compute the offline WCSS to choose the best of the clusters + //calcWCSSoffline(x, projector, MapOfIDAndCent1, rngvec, MapOfIDAandWCSS1_offline); + +/* void calcWCSSoffline(float[] x, Projector p, HashMap MapOfIDAndCent, float[] rngvec , HashMap MapOfIDAandWCSS_offline) { + + float[] xt = p.project(x); + + hashvec2_forwcss(xt,x,MapOfIDAndCent,rngvec ,MapOfIDAandWCSS_offline); + + } +*/ + static boolean isPowerOfTwo(long num) { + return (num & -num) == num; + } + + + public void printHashmap(HashMap hashmap) { + + System.out.println(hashmap.keySet()); + System.out.println(hashmap.values()); + + } +public void printStream(Stream> stream) { + + //System.out.println(hashmap.keySet()); + System.out.println(stream.count()); + +} +// this method calculates the epsilon value and prints the information. 
+public float printInfo(ListsetofKeys, HashMap MapOfIDAndCount, HashMap MapOfIDAndCent, HashMap MapOfIDAndWCSS) { + + List counts = new ArrayList<>(); + List wcsseprint = new ArrayList<>(); + float temp = 0; + int elements=0; + float avg=0; + + for (Long keys: setofKeys) + { + elements=elements+1; +//// System.out.println(MapOfIDAndCount.get(keys)); + counts.add(MapOfIDAndCount.get(keys)); + wcsseprint.add(MapOfIDAndWCSS.get(keys)); + + } +// System.out.println(); + System.out.print(counts); + +// for (Long keys: setofKeys) +// { +// System.out.println(MapOfIDAndWCSS.get(keys)); +// wcsseprint.add(MapOfIDAndWCSS.get(keys)); +// } + + // calculation of epsilon + /* + for (int i=0 ; i<(0.8*elements); i++) //for (int i=0 ; i<(0.8*elements); i++) + { + temp = temp + (wcsseprint.get(i))/(counts.get(i)); + } + avg = (float) (temp/(0.8*elements)); + System.out.println(); + System.out.println("\taverage epsilon = "+ avg); + */ + Collections.sort(wcsseprint); + Collections.reverse(wcsseprint); + System.out.println(); + System.out.println(wcsseprint); + System.out.println(); + + return (avg); + } + + /* + * X - data set k - canonical k in k-means l - clustering sub-space Compute + * density mode via iterative deepening hash counting + */ + + public Multimap findDensityModes2() { + //public Map findDensityModes2() { + HashMap MapOfIDAndCent1 = new HashMap<>(); + HashMap MapOfIDAndCount1 = new HashMap<>(); + HashMap MapOfIDAndWCSS1 = new HashMap<>(); + + HashMap MapOfIDAndCent2 = new HashMap<>(); + HashMap MapOfIDAndCount2 = new HashMap<>(); + HashMap MapOfIDAndWCSS2 = new HashMap<>(); + + HashMap MapOfIDAndCent3 = new HashMap<>(); + HashMap MapOfIDAndCount3 = new HashMap<>(); + HashMap MapOfIDAndWCSS3 = new HashMap<>(); + + + // #create projector matrixs + Projector projector = so.getProjectionType(); + projector.setOrigDim(so.getdim()); + projector.setProjectedDim(so.getDimparameter()); + + projector.setRandomSeed(so.getRandomSeed()); + //projector.setRandomSeed(949124732); 
+ + projector.init(); + int cutoff = so.getCutoff(); + + int ct = 0; + + { + + for (float[] x : so.getRawData()) + { + addtocounter(x, projector, MapOfIDAndCent1, MapOfIDAndCount1,ct++, rngvec, MapOfIDAndWCSS1); + addtocounter(x, projector, MapOfIDAndCent2, MapOfIDAndCount2,ct++, rngvec2,MapOfIDAndWCSS2); + addtocounter(x, projector, MapOfIDAndCent3, MapOfIDAndCount3,ct++, rngvec3,MapOfIDAndWCSS3); + + } + } + + System.out.println("\nNumberOfMicroClustersBeforePruning = , "+ MapOfIDAndCent1.size()); + //printHashmap(MapOfIDAndCount1); + + // next we want to prune the tree by parent count comparison + // follows breadthfirst search + + HashMap denseSetOfIDandCount2_1 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount1.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount1.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount1.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_1.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + MapOfIDAndCent1.put(parent_id, new float[]{}); + + // MapOfIDAndCount1.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_1.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent1.put(parent_id, new float[]{}); + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + } + } + } + } + } + + HashMap denseSetOfIDandCount2_2 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount2.keySet())) + { + + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount2.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount2.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + 
if(cur_count == parent_count) { + denseSetOfIDandCount2_2.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + MapOfIDAndCent2.put(parent_id, new float[]{}); + + // MapOfIDAndCount2.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_2.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent2.put(parent_id, new float[]{}); + // MapOfIDAndCount2.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_3 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount3.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount3.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount3.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_3.put(parent_id, 0L); + // IDAndCent.put(parent_id, new ArrayList<>()); + + MapOfIDAndCent3.put(parent_id, new float[]{}); + + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_3.remove(parent_id); + + // IDAndCent.put(parent_id, new ArrayList<>()); + MapOfIDAndCent3.put(parent_id, new float[]{}); + // MapOfIDAndCount.put(parent_id, new Long (0)); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + } + } + } + } + } + + + //remove keys with support less than 1 + Stream> stream2_1 = denseSetOfIDandCount2_1.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2_1= new ArrayList<>(); + // sort and limit the list + stream2_1.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_1.add(x.getKey())); + // printHashmap(denseSetOfIDandCount2_1); + + Stream> stream2_2 = denseSetOfIDandCount2_2.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2_2= new ArrayList<>(); + // sort and limit the list + stream2_2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_2.add(x.getKey())); + //printHashmap(denseSetOfIDandCount2_2); + + + Stream> stream2_3 = denseSetOfIDandCount2_3.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2_3= new ArrayList<>(); + // sort and limit the list + stream2_3.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_3.add(x.getKey())); + //printHashmap(denseSetOfIDandCount2_3); + + float WCSS1 = 0; + float WCSS2 = 0; + float WCSS3 = 0; + + HashMap denseSetOfIDandCount2 = new HashMap(); + + HashMap MapOfIDAndCent = new HashMap<>(); + HashMap MapOfIDAndCount = new HashMap<>(); + HashMap MapOfIDAndWCSS = new HashMap<>(); + + +/* float WCSS_off_1 = 0; +// float WCSS_off_2 = 0; +// float WCSS_off_3 = 0; + +// HashMap MapOfIDAandWCSS_offline_1 = new HashMap<>(); +// HashMap MapOfIDAandWCSS_offline_2 = new HashMap<>(); +// HashMap MapOfIDAandWCSS_offline_3 = new HashMap<>(); + + // calculate the real wcss in offline fashion, so for the keys , hash the points into those buckets + // and calculate the wcss as we know their centroids : + + +// for (float[] x : so.getRawData()) +// { + +// calcWCSSoffline(x, projector, MapOfIDAndCent1, rngvec, MapOfIDAandWCSS_offline_1); +// calcWCSSoffline(x, projector, MapOfIDAndCent2, rngvec2, MapOfIDAandWCSS_offline_2); +// calcWCSSoffline(x, projector, MapOfIDAndCent3, rngvec3, MapOfIDAandWCSS_offline_3); + +// } +*/ + + //* THIS IS THE RUNTIME CALCULATION OF WCSS STATISTICS WHICH IS DONE ONLINE: + + for (Long keys: sortedIDList2_1) +// for (Long cur_id : (((HashMap) stream2_1).keySet())) + 
{ // System.out.println("wcss1 = " + MapOfIDAndWCSS1.get(cur_id)); + WCSS1 = WCSS1 + MapOfIDAndWCSS1.get(keys);} + +// for (Long cur_id : (denseSetOfIDandCount2_2.keySet())) + for (Long keys: sortedIDList2_2) + { WCSS2 = WCSS2 + MapOfIDAndWCSS2.get(keys);} + +// for (Long cur_id : (denseSetOfIDandCount2_3.keySet())) + for (Long keys: sortedIDList2_3) + { WCSS3 = WCSS3 + MapOfIDAndWCSS3.get(keys);} + +/* THIS IS THE RUNTIME CALCULATION OF WCSS STATISTICS WHICH REQUIRES ANOTHER PASS OVER THE DATA: + for (Long keys: sortedIDList2_1) +// for (Long cur_id : (((HashMap) stream2_1).keySet())) + { // System.out.println("wcss1 = " + MapOfIDAndWCSS1.get(cur_id)); + WCSS_off_1 = WCSS_off_1 + MapOfIDAandWCSS_offline_1.get(keys);} + +// for (Long cur_id : (denseSetOfIDandCount2_2.keySet())) + for (Long keys: sortedIDList2_2) + { WCSS_off_2 = WCSS_off_2 + MapOfIDAandWCSS_offline_2.get(keys);} + +// for (Long cur_id : (denseSetOfIDandCount2_3.keySet())) + for (Long keys: sortedIDList2_3) + { WCSS_off_3 = WCSS_off_3 + MapOfIDAandWCSS_offline_3.get(keys);} +*/ + + System.out.println("wcss1(online calc) of candidate cents = , " + WCSS1); +// System.out.println(" wcss_ofline_calc_1 = " + WCSS_off_1); + + System.out.println("wcss1(online calc) of candidate cents = , " + WCSS2); +// System.out.println(" wcss_ofline_calc_2 = " + WCSS_off_2); + + System.out.println("wcss1(online calc) of candidate cents = , " + WCSS3); +// System.out.println(" wcss_ofline_calc_3 = " + WCSS_off_3); + + if ((WCSS1 <= WCSS2) && (WCSS1 <= WCSS3)) + {MapOfIDAndCount = MapOfIDAndCount1; + MapOfIDAndCent = MapOfIDAndCent1; + MapOfIDAndWCSS = MapOfIDAndWCSS1; + denseSetOfIDandCount2 = denseSetOfIDandCount2_1; + System.out.println("winner = tree1"); + } + else if ((WCSS2<= WCSS1) && (WCSS2<=WCSS3)) + {MapOfIDAndCount = MapOfIDAndCount2; + MapOfIDAndCent = MapOfIDAndCent2; + MapOfIDAndWCSS = MapOfIDAndWCSS2; + denseSetOfIDandCount2 = denseSetOfIDandCount2_2; + System.out.println("winner = tree2"); + } + else + 
{MapOfIDAndCount = MapOfIDAndCount3; + MapOfIDAndCent = MapOfIDAndCent3; + MapOfIDAndWCSS = MapOfIDAndWCSS3; + denseSetOfIDandCount2 = denseSetOfIDandCount2_3; + System.out.println("winner = tree3"); + + } + + System.out.println("NumberOfMicroClusters_AfterPruning_&_beforesortingLimit = , "+ denseSetOfIDandCount2.size()); + + //remove keys with support less than 1 + Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 2); + + List sortedIDList2= new ArrayList<>(); + // sort and limit the list + stream2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2.add(x.getKey())); + + System.out.println("------------------------------------------------------------------------------------------------------------------"); + //printHashmap(denseSetOfIDandCount2); + float eps= printInfo(sortedIDList2,denseSetOfIDandCount2, MapOfIDAndCent,MapOfIDAndWCSS); +// seteps(eps); + + Multimap multimapWeightAndCent = ArrayListMultimap.create(); + + for (Long keys: sortedIDList2) + + { + + multimapWeightAndCent.put((Long)(MapOfIDAndCount.get(keys)), (float[]) (MapOfIDAndCent.get(keys))); + + } + +/* + // this is to be taken out . only done for hypothesis testing. computing wcss for all the 3 trees. 
begin: + + boolean raw = Boolean.parseBoolean(("raw")); + List data = null; + try { + // "/C:/Users/deysn/Desktop/temp/har/1D.txt" ; /C:/Users/deysn/Documents/temp/covtype/1D.txt + // "C:/Users/deysn/Desktop/pd_backup/16gb/data_nick2/dim100/1D.txt" + // "C:/Users/deysn/Desktop/temp/dim600/1D.txt" + // "/C:/Users/deysn/Desktop/temp/run_results/3runs/1000noise10/1D.txt" + data = VectorUtil.readFile("/C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/covtype/covtype_5clus_1D.csv", raw); + + } catch (FileNotFoundException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + Multimap multimapWeightAndCent1 = ArrayListMultimap.create(); + for (Long keys: sortedIDList2_1) + { + multimapWeightAndCent1.put((Long)(MapOfIDAndCount1.get(keys)), (float[]) (MapOfIDAndCent1.get(keys))); + } + + Multimap multimapWeightAndCent2 = ArrayListMultimap.create(); + for (Long keys: sortedIDList2_2) + { + multimapWeightAndCent2.put((Long)(MapOfIDAndCount2.get(keys)), (float[]) (MapOfIDAndCent2.get(keys))); + } + + Multimap multimapWeightAndCent3 = ArrayListMultimap.create(); + for (Long keys: sortedIDList2_3) + { + multimapWeightAndCent3.put((Long)(MapOfIDAndCount3.get(keys)), (float[]) (MapOfIDAndCent3.get(keys))); + } + + Listcentroids1 = new ArrayList<>(); + List weights1 =new ArrayList<>(); + for (Long weights : multimapWeightAndCent1.keys()) + { + weights1.add((float)weights); + } + + for (Long weight : multimapWeightAndCent1.keySet()) + + { + centroids1.addAll(multimapWeightAndCent1.get(weight)); + } + + Agglomerative3 aggloOffline = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids1, so.getk()); + aggloOffline.setWeights(weights1); + List finalcentroids_1 = aggloOffline.getCentroids(); + + +// KMeans2 Offline = new KMeans2(); +// Offline.setK(so.getk()); +// Offline.setRawData(centroids1); +// Offline.setWeights(weights1); +// List finalcentroids_1 = 
Offline.getCentroids(); + +// MultiKMPP aggloOffline3 = new MultiKMPP(centroids1,so.getk()); +// List finalcentroids_1 = aggloOffline3.getCentroids(); + + Listcentroids2 = new ArrayList<>(); + List weights2 =new ArrayList<>(); + for (Long weights : multimapWeightAndCent2.keys()) + { + weights2.add((float)weights); + } + + for (Long weight : multimapWeightAndCent2.keySet()) + + { + centroids2.addAll(multimapWeightAndCent1.get(weight)); + } + + Agglomerative3 aggloOffline2 = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids2, so.getk()); + aggloOffline2.setWeights(weights2); + List finalcentroids_2 = aggloOffline2.getCentroids(); + +// KMeans2 Offline2 = new KMeans2(); +// Offline2.setK(so.getk()); +// Offline2.setRawData(centroids2); +// Offline2.setWeights(weights2); +// List finalcentroids_2 = Offline2.getCentroids(); + + // MultiKMPP aggloOffline2 = new MultiKMPP(centroids2,so.getk()); + // List finalcentroids_2 = aggloOffline2.getCentroids(); + + Listcentroids3 = new ArrayList<>(); + List weights3 =new ArrayList<>(); + for (Long weights : multimapWeightAndCent3.keys()) + { + weights3.add((float)weights); + } + + for (Long weight : multimapWeightAndCent3.keySet()) + + { + centroids3.addAll(multimapWeightAndCent3.get(weight)); + } + + Agglomerative3 aggloOffline3 = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids3, so.getk()); + aggloOffline3.setWeights(weights3); + List finalcentroids_3 = aggloOffline3.getCentroids(); + +// KMeans2 Offline3 = new KMeans2(); +// Offline3.setK(so.getk()); +// Offline3.setRawData(centroids3); +// Offline3.setWeights(weights3); +// List finalcentroids_3 = Offline3.getCentroids(); + +// MultiKMPP Offline3 = new MultiKMPP(centroids3,so.getk()); +// List finalcentroids_3 = Offline3.getCentroids(); + + + VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/run_results/3runs/har_k6/OutputTwrpCents_tree1"),finalcentroids_1, false); + + System.out.printf("kemans for tree1 
= "+"%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_1, data)); + + VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/run_results/3runs/har_k6/OutputTwrpCents_tree2"),finalcentroids_2, false); + + System.out.printf("kemans for tree2 = "+"%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_2, data)); + + VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/run_results/3runs/har_k6/OutputTwrpCents_tree3"),finalcentroids_3, false); + + System.out.printf("kemans for tree3 = "+ "%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_3, data) ); + + // this is to be taken out . only done for hypothesis testing. computing wcss for all the 3 trees. END +*/ + + return multimapWeightAndCent; + +} + + public void run() { + rngvec = new float[so.getDimparameter()]; + + rngvec2 = new float[so.getDimparameter()]; + + rngvec3 = new float[so.getDimparameter()]; + + counter = 0; + boolean randVect = so.getRandomVector(); + + // Random r = new Random(so.getRandomSeed()); + // Random r = new Random(3800635955020675334L) ; + + Random r = new Random(); + //Random r = new Random(923063597592675214L) ; + Random r2 = new Random(); + //Random r2 = new Random(923063597592675214L) ; + Random r3 = new Random(); + //Random r3 = new Random(923063597592675214L) ; + + if (randVect==true){ + for (int i = 0; i < so.getDimparameter(); i++) { + rngvec[i] = (float) r.nextGaussian(); + //System.out.println(rngvec[i]); + } + for (int i = 0; i < so.getDimparameter(); i++) + rngvec2[i] = (float) r2.nextGaussian(); + + for (int i = 0; i < so.getDimparameter(); i++) + rngvec3[i] = (float) r3.nextGaussian(); + + } else { + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) 0; + } + + Multimap WeightAndClusters = findDensityModes2(); + + Listcentroids2 = new ArrayList<>(); + List weights2 =new ArrayList<>(); + + System.out.println("\tNumberOfMicroClusters_AfterPruning = , 
"+ WeightAndClusters.size()); +// System.out.println("getRandomVector = "+ randVect); + + for (Long weights : WeightAndClusters.keys()) + { + + weights2.add((float)weights); + + } + + + for (Long weight : WeightAndClusters.keySet()) + + { + + centroids2.addAll(WeightAndClusters.get(weight)); + + } + +// Agglomerative3 aggloOffline = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids2, so.getk()); +// aggloOffline.setWeights(weights2); +// this.centroids = aggloOffline.getCentroids(); + + KMeans2 aggloOffline2 = new KMeans2(); + aggloOffline2.setK(so.getk()); + aggloOffline2.setRawData(centroids2); +// aggloOffline2.setWeights(weights2); + this.centroids = aggloOffline2.getCentroids(); + +// MultiKMPP aggloOffline3 = new MultiKMPP(centroids2,so.getk()); +// this.centroids = aggloOffline3.getCentroids(); + +//// DBScan algo = new DBScan(centroids2, (eps/(20)), 3); +//// System.out.println("epsssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss = "+ eps/(20)); +//// this.centroids = algo.getCentroids(); +//// System.out.println("no. 
of final output centroids = "+ centroids.size()); + + } + + public static void main(String[] args) throws FileNotFoundException, + IOException, InterruptedException { + + System.gc(); + + // int k ; //= 10; + // int d = 200;//16; + // int n = 10000; + // float var = 1.5f; + // int count = 1; + // System.out.printf("ClusterVar\t"); + // for (int i = 0; i < count; i++) + // System.out.printf("Trial%d\t", i); + // System.out.printf("RealWCSS\n"); + + // String Output = "/C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/run_results/3runs/rnaseq_k4/OutputTwrpCents_dbscan" ; + + // float f = var; + // float avgrealwcss = 0; + float avgtime = 0; + // System.out.printf("%f\t", f); + // GenerateData gen = new GenerateData(k, n/k, d, f, true, .5f); + + // gen.writeCSVToFile(new File("/C:/Users/deysn/Desktop/temp/run_results/3runs/rough/1D.txt")); + // List data = "/C:/Users/user/Desktop/temp/OutputTwrpCents1" + + // RPHashObject o = new SimpleArrayReader(gen.data, k); + + boolean raw = Boolean.parseBoolean(("raw")); + List data = null; + // "/C:/Users/deysn/Desktop/temp/har/1D.txt" ; C:/Users/deysn/Documents/temp/covtype/1D.txt + // C:/Users/dey.sn/Downloads/temp/covtype/1D.csv ; "C:/Users/dey.sn/Downloads/temp/run_results/3runs/har_k6/1D.txt" + String inputfile = "C:/Users/dey.sn/Downloads/temp/crop_mapping/1D.csv" ; + System.out.println(inputfile); + data = VectorUtil.readFile( inputfile , raw); + for (int k=4; k<=11;k++) + { + for (int i = 1; i <= 3; i++) + { + //k = 7; + + RPHashObject o = new SimpleArrayReader(data, k); + + o.setDimparameter(16); + o.setCutoff(130); //230 + o.setRandomVector(true); + +// System.out.println("cutoff = "+ o.getCutoff()); +// System.out.println("get_random_Vector = "+ o.getRandomVector()); + + TWRPv6_wcss_offline2_TEST rphit = new TWRPv6_wcss_offline2_TEST(o); + + System.gc(); + + Runtime rt = Runtime.getRuntime(); + rt.gc(); + Thread.sleep(10); + rt.gc(); + long startmemory = rt.totalMemory() - rt.freeMemory(); + long 
startTime = System.nanoTime(); + + List centsr = rphit.getCentroids(); + + avgtime += (System.nanoTime() - startTime) / 1000000000f ; + + float usedMB = ((rt.totalMemory() - rt.freeMemory()) - startmemory) / (1024*1024); + + System.out.println(" Time(in sec), " + avgtime + ", Mem_Used(MB):, " + (usedMB/3) ); + + rt.gc(); + Thread.sleep(10); + rt.gc(); + +// avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(),gen.getData()); +// String Output = "/C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/run_results/3runs/rnaseq_k4/OutputTwrpCents_dbscan" ; + String Output = "C:/Users/dey.sn/Downloads/work/output/cropmap_k7/cropmap_k7_kmeans_130_cutoff"+"_" +k+"_"+i+".csv" ; + VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); + +// VectorUtil.writeVectorFile(new File(Output+"_"+"labels"+".txt"), centsr.getLabels()); + + +// System.out.printf("WCSS for generated data = "+ "%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.out.printf(",WCSS for Winning Kmeans, = , "+ "%.0f ", StatTests.WCSSECentroidsFloat(centsr, data)); + System.out.println(",k, is: , "+k); +// + System.gc(); + } + } + +// System.out.printf("%.0f\n", avgrealwcss / count); + + + } + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + this.so.setK(getk); + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + return false; + } + + //@Override + public void setCutoff(int getCutoff) { 
+ this.so.setCutoff(getCutoff); + } + + //@Override + public void setRandomVector(boolean getRandomVector) { + this.so.setRandomVector(getRandomVector); + } + + public void seteps(float eps) { + this.eps=eps; + } +} diff --git a/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_10runs.java b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_10runs.java new file mode 100644 index 0000000..4f9532b --- /dev/null +++ b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_10runs.java @@ -0,0 +1,1305 @@ +package edu.uc.rphash; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +//import java.util.Arrays; +import java.util.HashMap; +//import java.util.Iterator; +//import java.util.LinkedHashMap; +import java.util.List; +//import java.util.Map; +import java.util.Map.Entry; +import java.util.Random; +import java.util.TreeSet; +import java.util.stream.Stream; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.kneefinder.JythonTest; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.clusterers.Agglomerative3; +import edu.uc.rphash.tests.clusterers.KMeans2; +import edu.uc.rphash.tests.clusterers.Agglomerative3.ClusteringType; +import edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.VectorUtil; + +//import org.apache.commons.collections.map.MultiValueMap; +//import org.apache.commons.collections.map.*; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Multimap; + + + +// this algorithm runs twrp 10 times : (only the random bisection vector varies, the Projection matrix remains same) +// and selects the one which has the best wcss offline for the 10X candidate centroids. 
+public class TWRPv6_wcss_offline2_TEST2_10runs implements Clusterer, Runnable { + + boolean znorm = false; + int [] num_of_clusters_stage1 = new int[12]; + int min_k; + int max_k; + + // convert this to an array of arrays + private int counter; + private float[] rngvec; + private float[] rngvec2; + private float[] rngvec3; + private float[] rngvec4; + private float[] rngvec5; + private float[] rngvec6; + private float[] rngvec7; + private float[] rngvec8; + private float[] rngvec9; + private float[] rngvec10; + private float[] rngvec11; + private float[] rngvec12; + + private List centroids = null; + + private RPHashObject so; + + public TWRPv6_wcss_offline2_TEST2_10runs(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + return getCentroids(); + } + + @Override + public List getCentroids() { + if (centroids == null) + run(); + return centroids; + } + + +// This function returns the square of the euclidean distance. + public static float distancesq(float[] x, float[] y) { + if (x.length < 1) + return Float.MAX_VALUE; + if (y.length < 1) + return Float.MAX_VALUE; + float dist = (x[0] - y[0]) * (x[0] - y[0]); + for (int i = 1; i < x.length; i++) + dist += ((x[i] - y[i]) * (x[i] - y[i])); +// return (float) Math.sqrt(dist); + return dist; + } + + +// This method finds the smallest of the numbers and returns that index. 
+ + public static int smallest(float[] arr) + { + // Initialize minimum element + float min = arr[0]; + int minindex = 0; + // System.out.println(" LENGHT : " + arr.length); + // Traverse array elements from second and + // compare every element with current max + for (int i = 1; i < (arr.length); i++) { + // System.out.println("the min value of i : " + i); + if (arr[i]< min) { + min = arr[i]; + minindex = i; + } + } + // System.out.println("the min value is : " + min); + // System.out.println("the index for min val is : " + minindex); + return minindex; + } + + + /* + * X - set of vectors compute the medoid of a vector set + */ + float[] medoid(List X) { + float[] ret = X.get(0); + for (int i = 1; i < X.size(); i++) { + for (int j = 0; j < ret.length; j++) { + ret[j] += X.get(i)[j]; + } + } + for (int j = 0; j < ret.length; j++) { + ret[j] = ret[j] / ((float) X.size()); + } + return ret; + } + +// this updates the map two cents with different weights are merged into one. + public static float[][] UpdateHashMap(float cnt_1, float[] x_1, float wcss_1, + float cnt_2, float[] x_2 , float wcss_2) { + + float cnt_r = cnt_1 + cnt_2; + + float[] x_r = new float[x_1.length]; + + for (int i = 0; i < x_1.length; i++) { + x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; + + } + + float wcss = ( ((wcss_1 + distancesq(x_r,x_1)) ) + distancesq(x_r,x_2) ); + + float[][] ret = new float[3][]; + ret[0] = new float[1]; + ret[0][0] = cnt_r; + ret[1] = x_r; + ret[2]= new float [1]; + ret[2][0]= wcss; + return ret; + + } + + + + public long hashvec2( float[] xt, float[] x, + HashMap MapOfIDAndCent, HashMap MapOfIDAndCount, int ct, float[] rngvec, HashMap MapOfIDAndWCSS) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { +// s <<= 1; + s = s << 1 ; // left shift the bits of s by 1. 
+ if (xt[i] > rngvec[i]) + s= s+1; + + if (MapOfIDAndCent.containsKey(s)) { + + float CurrentCount = MapOfIDAndCount.get(s); + float CurrentCent [] = MapOfIDAndCent.get(s); + float CountForIncomingVector = 1; + float IncomingVector [] = x; + float currentWcss= MapOfIDAndWCSS.get(s); + float incomingWcss= 0; + + float[][] MergedValues = UpdateHashMap(CurrentCount , CurrentCent, currentWcss, CountForIncomingVector, IncomingVector, incomingWcss ); + + Long UpdatedCount = (long) MergedValues[0][0] ; + + float[] MergedVector = MergedValues[1] ; + + float wcss= MergedValues[2][0]; + + MapOfIDAndCount.put(s , UpdatedCount); + + MapOfIDAndCent.put(s, MergedVector); + + MapOfIDAndWCSS.put(s, wcss); + + } + + else { + + float[] xlist = x; + MapOfIDAndCent.put(s, xlist); + MapOfIDAndCount.put(s, (long)1); + MapOfIDAndWCSS.put(s, (float)0); + } + } + return s; + } + + + /* + * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid + * vector map + * + * hash the projected vector x and update the hash to centroid and counts + * maps + */ + void addtocounter(float[] x, Projector p, + HashMap IDAndCent,HashMap IDandID,int ct, float[] rngvec , HashMap IDandWCSS) { + float[] xt = p.project(x); + + hashvec2(xt,x,IDAndCent, IDandID, ct,rngvec , IDandWCSS); + } + + + /* + * X - data set k - canonical k in k-means l - clustering sub-space Compute + * density mode via iterative deepening hash counting + */ + + public Multimap findDensityModes2() { + + HashMap MapOfIDAndCent1 = new HashMap<>(); + HashMap MapOfIDAndCount1 = new HashMap<>(); + HashMap MapOfIDAndWCSS1 = new HashMap<>(); + + HashMap MapOfIDAndCent2 = new HashMap<>(); + HashMap MapOfIDAndCount2 = new HashMap<>(); + HashMap MapOfIDAndWCSS2 = new HashMap<>(); + + HashMap MapOfIDAndCent3 = new HashMap<>(); + HashMap MapOfIDAndCount3 = new HashMap<>(); + HashMap MapOfIDAndWCSS3 = new HashMap<>(); + + HashMap MapOfIDAndCent4 = new HashMap<>(); + HashMap MapOfIDAndCount4 = new HashMap<>(); + HashMap MapOfIDAndWCSS4 = 
new HashMap<>(); + + HashMap MapOfIDAndCent5 = new HashMap<>(); + HashMap MapOfIDAndCount5 = new HashMap<>(); + HashMap MapOfIDAndWCSS5 = new HashMap<>(); + + HashMap MapOfIDAndCent6 = new HashMap<>(); + HashMap MapOfIDAndCount6 = new HashMap<>(); + HashMap MapOfIDAndWCSS6 = new HashMap<>(); + + HashMap MapOfIDAndCent7 = new HashMap<>(); + HashMap MapOfIDAndCount7 = new HashMap<>(); + HashMap MapOfIDAndWCSS7 = new HashMap<>(); + + HashMap MapOfIDAndCent8 = new HashMap<>(); + HashMap MapOfIDAndCount8 = new HashMap<>(); + HashMap MapOfIDAndWCSS8 = new HashMap<>(); + + HashMap MapOfIDAndCent9 = new HashMap<>(); + HashMap MapOfIDAndCount9 = new HashMap<>(); + HashMap MapOfIDAndWCSS9 = new HashMap<>(); + + HashMap MapOfIDAndCent10 = new HashMap<>(); + HashMap MapOfIDAndCount10 = new HashMap<>(); + HashMap MapOfIDAndWCSS10 = new HashMap<>(); + + HashMap MapOfIDAndCent11 = new HashMap<>(); + HashMap MapOfIDAndCount11 = new HashMap<>(); + HashMap MapOfIDAndWCSS11 = new HashMap<>(); + + HashMap MapOfIDAndCent12 = new HashMap<>(); + HashMap MapOfIDAndCount12 = new HashMap<>(); + HashMap MapOfIDAndWCSS12 = new HashMap<>(); + + + // #create projector matrixs + Projector projector = so.getProjectionType(); + projector.setOrigDim(so.getdim()); + projector.setProjectedDim(so.getDimparameter()); + projector.setRandomSeed(so.getRandomSeed()); +// projector.setRandomSeed(535247432); + projector.init(); + + // #create projector matrixs + Projector projector2 = so.getProjectionType(); + projector2.setOrigDim(so.getdim()); + projector2.setProjectedDim(so.getDimparameter()); + projector2.setRandomSeed(so.getRandomSeed()); +// projector.setRandomSeed(535247432); + projector2.init(); + + // #create projector matrixs + Projector projector3 = so.getProjectionType(); + projector3.setOrigDim(so.getdim()); + projector3.setProjectedDim(so.getDimparameter()); + projector3.setRandomSeed(so.getRandomSeed()); +// projector.setRandomSeed(535247432); + projector3.init(); + + int cutoff = 
so.getCutoff(); + + int ct = 0; + + { + + for (float[] x : so.getRawData()) + { + addtocounter(x, projector, MapOfIDAndCent1, MapOfIDAndCount1,ct++, rngvec, MapOfIDAndWCSS1); + addtocounter(x, projector, MapOfIDAndCent2, MapOfIDAndCount2,ct++, rngvec2,MapOfIDAndWCSS2); + addtocounter(x, projector, MapOfIDAndCent3, MapOfIDAndCount3,ct++, rngvec3,MapOfIDAndWCSS3); + addtocounter(x, projector, MapOfIDAndCent4, MapOfIDAndCount4,ct++, rngvec4,MapOfIDAndWCSS4); + + addtocounter(x, projector2, MapOfIDAndCent5, MapOfIDAndCount5,ct++, rngvec5,MapOfIDAndWCSS5); + addtocounter(x, projector2, MapOfIDAndCent6, MapOfIDAndCount6,ct++, rngvec6, MapOfIDAndWCSS6); + addtocounter(x, projector2, MapOfIDAndCent7, MapOfIDAndCount7,ct++, rngvec7,MapOfIDAndWCSS7); + addtocounter(x, projector2, MapOfIDAndCent8, MapOfIDAndCount8,ct++, rngvec8,MapOfIDAndWCSS8); + + addtocounter(x, projector3, MapOfIDAndCent9, MapOfIDAndCount9,ct++, rngvec9,MapOfIDAndWCSS9); + addtocounter(x, projector3, MapOfIDAndCent10, MapOfIDAndCount10,ct++, rngvec10,MapOfIDAndWCSS10); + addtocounter(x, projector3, MapOfIDAndCent11, MapOfIDAndCount11,ct++, rngvec11,MapOfIDAndWCSS11); + addtocounter(x, projector3, MapOfIDAndCent12, MapOfIDAndCount12,ct++, rngvec12,MapOfIDAndWCSS12); + + + } + } + + System.out.println("NumberOfMicroClustersBeforePruning = "+ MapOfIDAndCent3.size()); + + // next we want to prune the tree by parent count comparison + // follows breadthfirst search + + HashMap denseSetOfIDandCount2_1 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount1.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount1.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount1.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_1.put(parent_id, 0L); + + MapOfIDAndCent1.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + + 
} + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_1.remove(parent_id); + + MapOfIDAndCent1.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_2 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount2.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount2.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount2.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_2.put(parent_id, 0L); + + MapOfIDAndCent2.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_2.remove(parent_id); + + MapOfIDAndCent2.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_3 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount3.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount3.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount3.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_3.put(parent_id, 0L); + + MapOfIDAndCent3.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_3.remove(parent_id); + + MapOfIDAndCent3.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_4 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount4.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) 
(MapOfIDAndCount4.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount4.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_4.put(parent_id, 0L); + + MapOfIDAndCent4.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_4.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_4.remove(parent_id); + + MapOfIDAndCent4.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_4.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_5 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount5.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount5.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount5.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_5.put(parent_id, 0L); + + MapOfIDAndCent5.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_5.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_5.remove(parent_id); + + MapOfIDAndCent5.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_5.put(cur_id, (long) cur_count); + } + } + } + } + } + + + + HashMap denseSetOfIDandCount2_6 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount6.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount6.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount6.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_6.put(parent_id, 0L); + + MapOfIDAndCent6.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_6.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > 
parent_count) { + denseSetOfIDandCount2_6.remove(parent_id); + + MapOfIDAndCent6.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_6.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_7 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount7.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount7.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount7.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_7.put(parent_id, 0L); + + MapOfIDAndCent7.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_7.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_7.remove(parent_id); + + MapOfIDAndCent7.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_7.put(cur_id, (long) cur_count); + } + } + } + } + } + + HashMap denseSetOfIDandCount2_8 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount8.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount8.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount8.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_8.put(parent_id, 0L); + + MapOfIDAndCent8.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_8.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_8.remove(parent_id); + + MapOfIDAndCent8.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_8.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_9 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount9.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount9.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + 
int parent_count = (int) (MapOfIDAndCount9.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_9.put(parent_id, 0L); + + MapOfIDAndCent9.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_9.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_9.remove(parent_id); + + MapOfIDAndCent9.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_9.put(cur_id, (long) cur_count); + } + } + } + } + } + + HashMap denseSetOfIDandCount2_10 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount10.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount10.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount10.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_10.put(parent_id, 0L); + + MapOfIDAndCent10.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_10.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_10.remove(parent_id); + + MapOfIDAndCent10.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_10.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_11 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount11.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount11.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount11.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_11.put(parent_id, 0L); + + MapOfIDAndCent11.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_11.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_11.remove(parent_id); + + 
MapOfIDAndCent11.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_11.put(cur_id, (long) cur_count); + } + } + } + } + } + + HashMap denseSetOfIDandCount2_12 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount12.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount12.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount12.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_12.put(parent_id, 0L); + + MapOfIDAndCent12.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_12.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_12.remove(parent_id); + + MapOfIDAndCent12.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_12.put(cur_id, (long) cur_count); + } + } + } + } + } + + //remove keys with support less than 1 + Stream> stream2_1 = denseSetOfIDandCount2_1.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_1= new ArrayList<>(); + // sort and limit the list + stream2_1.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_1.add(x.getKey())); + + + Stream> stream2_2 = denseSetOfIDandCount2_2.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_2= new ArrayList<>(); + // sort and limit the list + stream2_2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_2.add(x.getKey())); + + + Stream> stream2_3 = denseSetOfIDandCount2_3.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_3= new ArrayList<>(); + // sort and limit the list + stream2_3.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_3.add(x.getKey())); + + + Stream> stream2_4 = denseSetOfIDandCount2_4.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_4= new ArrayList<>(); + // sort and limit the list + stream2_4.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_4.add(x.getKey())); + + Stream> stream2_5 = denseSetOfIDandCount2_5.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_5= new ArrayList<>(); + // sort and limit the list + stream2_5.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_5.add(x.getKey())); + + + Stream> stream2_6 = denseSetOfIDandCount2_6.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_6= new ArrayList<>(); + // sort and limit the list + stream2_6.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_6.add(x.getKey())); + + + Stream> stream2_7 = denseSetOfIDandCount2_7.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_7= new ArrayList<>(); + // sort and limit the list + stream2_7.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_7.add(x.getKey())); + + + Stream> stream2_8 = denseSetOfIDandCount2_8.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_8= new ArrayList<>(); + // sort and limit the list + stream2_8.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_8.add(x.getKey())); + + + Stream> stream2_9 = denseSetOfIDandCount2_9.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_9= new ArrayList<>(); + // sort and limit the list + stream2_9.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_9.add(x.getKey())); + + + Stream> stream2_10 = denseSetOfIDandCount2_10.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_10= new ArrayList<>(); + // sort and limit the list + stream2_10.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_10.add(x.getKey())); + + + Stream> stream2_11 = denseSetOfIDandCount2_11.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_11= new ArrayList<>(); + // sort and limit the list + stream2_11.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_11.add(x.getKey())); + + Stream> stream2_12 = denseSetOfIDandCount2_12.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_12= new ArrayList<>(); + // sort and limit the list + stream2_12.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_12.add(x.getKey())); + +// finding elbows + JythonTest elbowcalculator = new JythonTest(); + double sum_jt = 0; + + num_of_clusters_stage1[0]= elbowcalculator.find_elbow(sortedIDList2_1); + num_of_clusters_stage1[1]= elbowcalculator.find_elbow(sortedIDList2_2); + num_of_clusters_stage1[2]= elbowcalculator.find_elbow(sortedIDList2_3); + num_of_clusters_stage1[3]= elbowcalculator.find_elbow(sortedIDList2_4); + num_of_clusters_stage1[4]= elbowcalculator.find_elbow(sortedIDList2_5); + num_of_clusters_stage1[5]= elbowcalculator.find_elbow(sortedIDList2_6); + num_of_clusters_stage1[6]= elbowcalculator.find_elbow(sortedIDList2_7); + num_of_clusters_stage1[7]= elbowcalculator.find_elbow(sortedIDList2_8); + num_of_clusters_stage1[8]= elbowcalculator.find_elbow(sortedIDList2_9); + num_of_clusters_stage1[9]= elbowcalculator.find_elbow(sortedIDList2_10); + num_of_clusters_stage1[10]= elbowcalculator.find_elbow(sortedIDList2_11); + num_of_clusters_stage1[11]= 
elbowcalculator.find_elbow(sortedIDList2_12); + + for (int i=0 ; i<12; i++) { + + //int num_of_clusters_2= elbowcalculator.find_elbow(counts); + System.out.println("\n" + "No. of clusters_stage1 = " + num_of_clusters_stage1[i]); + //System.out.println( "No. of clusters_by_COUNT = " + num_of_clusters_2); + System.out.println( "************************************************************" ); + sum_jt = sum_jt + num_of_clusters_stage1[i]; + } + System.out.println("\n" + "sum of No. of clusters_stage1 = " + sum_jt); + + double avg_clus_stage1 = Math.ceil(sum_jt / 12.0) ; + System.out.println("\n" + "Average of No. of clusters_stage1 = " + avg_clus_stage1 ); + System.out.println( "************************************************************" ); + // finding the range of K for offline clustering : + min_k = (int) (avg_clus_stage1 - 3) ; + + max_k = (int) (avg_clus_stage1 + 3) ; + + if (min_k < 3 ) { min_k = 3 ; max_k = 9; } ; + + + float WCSS1 = 0; + float WCSS2 = 0; + float WCSS3 = 0; + float WCSS4 = 0; + float WCSS5 = 0; + float WCSS6 = 0; + float WCSS7 = 0; + float WCSS8 = 0; + float WCSS9 = 0; + float WCSS10 = 0; + float WCSS11 = 0; + float WCSS12 = 0; + + + HashMap denseSetOfIDandCount2 = new HashMap(); + HashMap MapOfIDAndCent = new HashMap<>(); + HashMap MapOfIDAndCount = new HashMap<>(); + HashMap MapOfIDAndWCSS = new HashMap<>(); + + + //* THIS IS THE RUNTIME CALCULATION OF WCSS STATISTICS WHICH IS DONE ONLINE: + + for (Long keys: sortedIDList2_1){ + WCSS1 = WCSS1 + MapOfIDAndWCSS1.get(keys);} + + for (Long keys: sortedIDList2_2) + { WCSS2 = WCSS2 + MapOfIDAndWCSS2.get(keys);} + + for (Long keys: sortedIDList2_3) + { WCSS3 = WCSS3 + MapOfIDAndWCSS3.get(keys);} + + for (Long keys: sortedIDList2_4) + { WCSS4 = WCSS4 + MapOfIDAndWCSS4.get(keys);} + + for (Long keys: sortedIDList2_5) + { WCSS5 = WCSS5 + MapOfIDAndWCSS5.get(keys);} + + for (Long keys: sortedIDList2_6){ + WCSS6 = WCSS6 + MapOfIDAndWCSS6.get(keys);} + + for (Long keys: sortedIDList2_7) + { WCSS7 = 
WCSS7 + MapOfIDAndWCSS7.get(keys);} + + for (Long keys: sortedIDList2_8) + { WCSS8 = WCSS8 + MapOfIDAndWCSS8.get(keys);} + + for (Long keys: sortedIDList2_9) + { WCSS9 = WCSS9 + MapOfIDAndWCSS9.get(keys);} + + for (Long keys: sortedIDList2_10) + { WCSS10 = WCSS10 + MapOfIDAndWCSS10.get(keys);} + + for (Long keys: sortedIDList2_11) + { WCSS11 = WCSS11 + MapOfIDAndWCSS11.get(keys);} + + for (Long keys: sortedIDList2_12) + { WCSS12 = WCSS12 + MapOfIDAndWCSS12.get(keys);} + + + + System.out.print(" wcss1 = " + WCSS1); + + System.out.print(" wcss2 = " + WCSS2); + + System.out.print(" wcss3 = " + WCSS3); + + System.out.print(" wcss4 = " + WCSS4); + + System.out.print(" wcss5 = " + WCSS5); + + System.out.print(" wcss6 = " + WCSS6); + + System.out.print(" wcss7 = " + WCSS7); + + System.out.print(" wcss8 = " + WCSS8); + + System.out.print(" wcss9 = " + WCSS9); + + System.out.print(" wcss10 = " + WCSS10); + + System.out.print(" wcss11 = " + WCSS11); + + System.out.print(" wcss12 = " + WCSS12); + +// float arr[] = {WCSS_off_1,WCSS_off_2,WCSS_off_3,WCSS_off_4,WCSS_off_5,WCSS_off_6,WCSS_off_7,WCSS_off_8,WCSS_off_9,WCSS_off_10}; + float arr[] = {WCSS1,WCSS2,WCSS3,WCSS4,WCSS5,WCSS6,WCSS7,WCSS8,WCSS9,WCSS10, WCSS11, WCSS12 }; + + int index_of_max = smallest(arr); + + if (index_of_max == 0){ + MapOfIDAndCount = MapOfIDAndCount1; + MapOfIDAndCent = MapOfIDAndCent1; + MapOfIDAndWCSS = MapOfIDAndWCSS1; + denseSetOfIDandCount2 = denseSetOfIDandCount2_1; + System.out.println("winner = tree1"); + } + if (index_of_max == 1){ + MapOfIDAndCount = MapOfIDAndCount2; + MapOfIDAndCent = MapOfIDAndCent2; + MapOfIDAndWCSS = MapOfIDAndWCSS2; + denseSetOfIDandCount2 = denseSetOfIDandCount2_2; + System.out.println("winner = tree2"); + } + if (index_of_max == 2){ + MapOfIDAndCount = MapOfIDAndCount3; + MapOfIDAndCent = MapOfIDAndCent3; + MapOfIDAndWCSS = MapOfIDAndWCSS3; + denseSetOfIDandCount2 = denseSetOfIDandCount2_3; + System.out.println("winner = tree3"); + } + if (index_of_max == 3) { + 
MapOfIDAndCount = MapOfIDAndCount4; + MapOfIDAndCent = MapOfIDAndCent4; + MapOfIDAndWCSS = MapOfIDAndWCSS4; + denseSetOfIDandCount2 = denseSetOfIDandCount2_4; + System.out.println("winner = tree4"); + } + + if (index_of_max == 4) { + MapOfIDAndCount = MapOfIDAndCount5; + MapOfIDAndCent = MapOfIDAndCent5; + MapOfIDAndWCSS = MapOfIDAndWCSS5; + denseSetOfIDandCount2 = denseSetOfIDandCount2_5; + System.out.println("winner = tree5"); + } + if (index_of_max == 5) { + MapOfIDAndCount = MapOfIDAndCount6; + MapOfIDAndCent = MapOfIDAndCent6; + MapOfIDAndWCSS = MapOfIDAndWCSS6; + denseSetOfIDandCount2 = denseSetOfIDandCount2_6; + System.out.println("winner = tree6"); + } + if (index_of_max == 6) { + MapOfIDAndCount = MapOfIDAndCount7; + MapOfIDAndCent = MapOfIDAndCent7; + MapOfIDAndWCSS = MapOfIDAndWCSS7; + denseSetOfIDandCount2 = denseSetOfIDandCount2_7; + System.out.println("winner = tree7"); + } + if (index_of_max == 7) { + MapOfIDAndCount = MapOfIDAndCount8; + MapOfIDAndCent = MapOfIDAndCent8; + MapOfIDAndWCSS = MapOfIDAndWCSS8; + denseSetOfIDandCount2 = denseSetOfIDandCount2_8; + System.out.println("winner = tree8"); + } + if (index_of_max == 8) { + MapOfIDAndCount = MapOfIDAndCount9; + MapOfIDAndCent = MapOfIDAndCent9; + MapOfIDAndWCSS = MapOfIDAndWCSS9; + denseSetOfIDandCount2 = denseSetOfIDandCount2_9; + System.out.println("winner = tree9"); + } + if (index_of_max == 9) { + MapOfIDAndCount = MapOfIDAndCount10; + MapOfIDAndCent = MapOfIDAndCent10; + MapOfIDAndWCSS = MapOfIDAndWCSS10; + denseSetOfIDandCount2 = denseSetOfIDandCount2_10; + System.out.println("winner = tree10"); + } + if (index_of_max == 10) { + MapOfIDAndCount = MapOfIDAndCount11; + MapOfIDAndCent = MapOfIDAndCent11; + MapOfIDAndWCSS = MapOfIDAndWCSS11; + denseSetOfIDandCount2 = denseSetOfIDandCount2_11; + System.out.println("winner = tree11"); + } + if (index_of_max == 11) { + MapOfIDAndCount = MapOfIDAndCount12; + MapOfIDAndCent = MapOfIDAndCent12; + MapOfIDAndWCSS = MapOfIDAndWCSS12; + 
denseSetOfIDandCount2 = denseSetOfIDandCount2_12; + System.out.println("winner = tree12"); + } + + + System.out.println("NumberOfMicroClustersAfterPruning&beforesortingLimit = "+ denseSetOfIDandCount2.size()); + + //remove keys with support less than 1 + Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2= new ArrayList<>(); + // sort and limit the list + stream2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2.add(x.getKey())); + + Multimap multimapWeightAndCent = ArrayListMultimap.create(); + + for (Long keys: sortedIDList2) + + { + + multimapWeightAndCent.put((Long)(MapOfIDAndCount.get(keys)), (float[]) (MapOfIDAndCent.get(keys))); + + } + + + return multimapWeightAndCent; + +} + + public void run() { + rngvec = new float[so.getDimparameter()]; + rngvec2 = new float[so.getDimparameter()]; + rngvec3 = new float[so.getDimparameter()]; + rngvec4 = new float[so.getDimparameter()]; + rngvec5 = new float[so.getDimparameter()]; + + rngvec6 = new float[so.getDimparameter()]; + rngvec7 = new float[so.getDimparameter()]; + rngvec8 = new float[so.getDimparameter()]; + rngvec9 = new float[so.getDimparameter()]; + rngvec10 = new float[so.getDimparameter()]; + rngvec11 = new float[so.getDimparameter()]; + rngvec12 = new float[so.getDimparameter()]; + + counter = 0; + boolean randVect = so.getRandomVector(); + + // Random r = new Random(so.getRandomSeed()); + // Random r = new Random(3800635955020675334L) ; + Random r = new Random(); + Random r2 = new Random(); + Random r3 = new Random(); + Random r4 = new Random(); + Random r5 = new Random(); + Random r6 = new Random(); + Random r7 = new Random(); + Random r8 = new Random(); + Random r9 = new Random(); + Random r10 = new Random(); + Random r11 = new Random(); + Random r12 = new Random(); + + if (randVect==true){ + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) r.nextGaussian(); + for (int i = 0; i 
< so.getDimparameter(); i++) + rngvec2[i] = (float) r2.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec3[i] = (float) r3.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec4[i] = (float) r4.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec5[i] = (float) r5.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec6[i] = (float) r6.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec7[i] = (float) r7.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec8[i] = (float) r8.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec9[i] = (float) r9.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec10[i] = (float) r10.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec11[i] = (float) r11.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec11[i] = (float) r12.nextGaussian(); + + } + + else { + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) 0; + + } + + Multimap WeightAndClusters = findDensityModes2(); + + Listcentroids2 = new ArrayList<>(); + List weights2 =new ArrayList<>(); + + + System.out.println("\tNumberOfMicroClusters_AfterPruning = "+ WeightAndClusters.size()); + System.out.println("getRandomVector = "+ randVect); + + + for (Long weights : WeightAndClusters.keys()) + { + + weights2.add((float)weights); + + } + + + for (Long weight : WeightAndClusters.keySet()) + + { + + centroids2.addAll(WeightAndClusters.get(weight)); + + } + + +// Agglomerative3 aggloOffline = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids2, so.getk()); +// aggloOffline.setWeights(weights2); +// this.centroids = aggloOffline.getCentroids(); + + KMeans2 aggloOffline2 = new KMeans2(); + aggloOffline2.setRawData(centroids2); + aggloOffline2.setWeights(weights2); + + + List data1 = null; + data1 = so.getRawData(); + + List elbow_wcss = new ArrayList<>(); + + for 
(int k=min_k; k<=max_k ;k++) { + + aggloOffline2.setK(k); + List cent1 = aggloOffline2.getCentroids(); + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(cent1, data1)); + long tempu = (long) StatTests.WCSSECentroidsFloat(cent1, data1); + elbow_wcss.add(tempu); + } + + + // finding elbows + JythonTest elbowcalculator = new JythonTest(); + double sum_jt = 0; + + int num_of_clusters_stage2 = min_k + elbowcalculator.find_elbow(elbow_wcss); + + System.out.println("\n" + "No. of clusters_stage_2_Final = " + num_of_clusters_stage2 ); + + System.out.println("\n" + "No. of Data Points = " + so.getRawData().size() ); + + + // final choice of centroids : ( repetative calculation , please optimize ) + aggloOffline2.setK(num_of_clusters_stage2); + + this.centroids = aggloOffline2.getCentroids(); + + } + + + public static void main(String[] args) throws FileNotFoundException, + IOException , InterruptedException { + + // int k = 10;//6; + // int d = 200;//16; + // int n = 10000; + // float var = 1.5f; + // int count = 1; + // System.out.printf("ClusterVar\t"); + // for (int i = 0; i < count; i++) + // System.out.printf("Trial%d\t", i); + // System.out.printf("RealWCSS\n"); + + // String Output = "/C:/Users/deysn/Desktop/temp/har/run_results/10runs/OutputTwrpCents_mainfunc_1" ; + + // float f = var; + float avgrealwcss = 0; + float avgtime = 0; + // System.out.printf("%f\t", f); + // GenerateData gen = new GenerateData(k, n/k, d, f, true, .5f); + + // gen.writeCSVToFile(new File("/home/lee/Desktop/reclsh/in.csv")); + // List data = "/C:/Users/user/Desktop/temp/OutputTwrpCents1" + + // RPHashObject o = new SimpleArrayReader(gen.data, k); + + boolean raw = Boolean.parseBoolean(("raw")); + List data = null; + // "C:\Users\sayan\OneDrive - University of Cincinnati\Documents\downloaded\run_results\run_results\3runs\har_k6\1D.txt" + + String inputfile = "C:/Users/sayan/OneDrive - University of Cincinnati/Documents/downloaded/run_results/run_results/3runs/har_k6/1D.txt" ; + 
System.out.println(inputfile); + + data = VectorUtil.readFile( inputfile , raw); + + int dummyk = 8; + RPHashObject o = new SimpleArrayReader(data, dummyk); + + + o.setDimparameter(16); + o.setCutoff(60); + o.setRandomVector(true); + +// System.out.println("cutoff = "+ o.getCutoff()); +// System.out.println("get_random_Vector = "+ o.getRandomVector()); + + TWRPv6_wcss_offline2_TEST2_10runs rphit = new TWRPv6_wcss_offline2_TEST2_10runs(o); + long startTime = System.nanoTime(); + List centsr = rphit.getCentroids(); + + avgtime += (System.nanoTime() - startTime) / 100000000; + +// avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(),gen.getData()); + + String Output = "/C:/Users/sayan/OneDrive - University of Cincinnati/Documents/downloaded/results/har_6clus/har_k6_kmeans_130_cutoff"+"_" +"_"+".csv" ; + VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); + +// System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, data)); + + System.gc(); + +// System.out.printf("%.0f\n", avgrealwcss / count); + + + } + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + this.so.setK(getk); + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + return false; + } + + //@Override + public void setCutoff(int getCutoff) { + this.so.setCutoff(getCutoff); + } + + //@Override + public void 
setRandomVector(boolean getRandomVector) { + this.so.setRandomVector(getRandomVector); + } + + +} diff --git a/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_10runs_agingcents.java b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_10runs_agingcents.java new file mode 100644 index 0000000..018da2c --- /dev/null +++ b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_10runs_agingcents.java @@ -0,0 +1,1419 @@ +package edu.uc.rphash; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +//import java.util.Arrays; +import java.util.HashMap; +//import java.util.Iterator; +//import java.util.LinkedHashMap; +import java.util.List; +//import java.util.Map; +import java.util.Map.Entry; +import java.util.Random; +import java.util.TreeSet; +import java.util.stream.Stream; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.kneefinder.JythonTest; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.clusterers.Agglomerative3; +import edu.uc.rphash.tests.clusterers.KMeans2; +import edu.uc.rphash.tests.clusterers.Agglomerative3.ClusteringType; +import edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.VectorUtil; +import edu.uc.rphash.aging.ageCentriods; + +//import org.apache.commons.collections.map.MultiValueMap; +//import org.apache.commons.collections.map.*; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Multimap; + + + +// this algorithm runs twrp 10 times : (only the random bisection vector varies, the Projection matrix remains same) +// and selects the one which has the best wcss offline for the 10X candidate centroids. 
+public class TWRPv6_wcss_offline2_TEST2_10runs_agingcents implements Clusterer, Runnable { + + boolean znorm = false; + int [] num_of_clusters_stage1 = new int[12]; + int min_k; + int max_k; + + // convert this to an array of arrays + private int counter; + private float[] rngvec; + private float[] rngvec2; + private float[] rngvec3; + private float[] rngvec4; + private float[] rngvec5; + private float[] rngvec6; + private float[] rngvec7; + private float[] rngvec8; + private float[] rngvec9; + private float[] rngvec10; + private float[] rngvec11; + private float[] rngvec12; + + private List centroids = null; + + private RPHashObject so; + + public TWRPv6_wcss_offline2_TEST2_10runs_agingcents(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + return getCentroids(); + } + + @Override + public List getCentroids() { + if (centroids == null) + run(); + return centroids; + } + + +// This function returns the square of the euclidean distance. + public static float distancesq(float[] x, float[] y) { + if (x.length < 1) + return Float.MAX_VALUE; + if (y.length < 1) + return Float.MAX_VALUE; + float dist = (x[0] - y[0]) * (x[0] - y[0]); + for (int i = 1; i < x.length; i++) + dist += ((x[i] - y[i]) * (x[i] - y[i])); +// return (float) Math.sqrt(dist); + return dist; + } + + +// This method finds the smallest of the numbers and returns that index. 
+ + public static int smallest(float[] arr) + { + // Initialize minimum element + float min = arr[0]; + int minindex = 0; + // System.out.println(" LENGHT : " + arr.length); + // Traverse array elements from second and + // compare every element with current max + for (int i = 1; i < (arr.length); i++) { + // System.out.println("the min value of i : " + i); + if (arr[i]< min) { + min = arr[i]; + minindex = i; + } + } + // System.out.println("the min value is : " + min); + // System.out.println("the index for min val is : " + minindex); + return minindex; + } + + + /* + * X - set of vectors compute the medoid of a vector set + */ + float[] medoid(List X) { + float[] ret = X.get(0); + for (int i = 1; i < X.size(); i++) { + for (int j = 0; j < ret.length; j++) { + ret[j] += X.get(i)[j]; + } + } + for (int j = 0; j < ret.length; j++) { + ret[j] = ret[j] / ((float) X.size()); + } + return ret; + } + +// this updates the map two cents with different weights are merged into one. + public static float[][] UpdateHashMap(float cnt_1, float[] x_1, float wcss_1, + float cnt_2, float[] x_2 , float wcss_2) { + + float cnt_r = cnt_1 + cnt_2; + + float[] x_r = new float[x_1.length]; + + for (int i = 0; i < x_1.length; i++) { + x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; + + } + + float wcss = ( ((wcss_1 + distancesq(x_r,x_1)) ) + distancesq(x_r,x_2) ); + + float[][] ret = new float[3][]; + ret[0] = new float[1]; + ret[0][0] = cnt_r; + ret[1] = x_r; + ret[2]= new float [1]; + ret[2][0]= wcss; + return ret; + + } + + + + public long hashvec2( float[] xt, float[] x, + HashMap MapOfIDAndCent, HashMap MapOfIDAndCount, int ct, float[] rngvec, HashMap MapOfIDAndWCSS) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { +// s <<= 1; + s = s << 1 ; // left shift the bits of s by 1. 
+ if (xt[i] > rngvec[i]) + s= s+1; + + if (MapOfIDAndCent.containsKey(s)) { + + float CurrentCount = MapOfIDAndCount.get(s); + float CurrentCent [] = MapOfIDAndCent.get(s); + float CountForIncomingVector = 1; + float IncomingVector [] = x; + float currentWcss= MapOfIDAndWCSS.get(s); + float incomingWcss= 0; + + float[][] MergedValues = UpdateHashMap(CurrentCount , CurrentCent, currentWcss, CountForIncomingVector, IncomingVector, incomingWcss ); + + Long UpdatedCount = (long) MergedValues[0][0] ; + + float[] MergedVector = MergedValues[1] ; + + float wcss= MergedValues[2][0]; + + MapOfIDAndCount.put(s , UpdatedCount); + + MapOfIDAndCent.put(s, MergedVector); + + MapOfIDAndWCSS.put(s, wcss); + + } + + else { + + float[] xlist = x; + MapOfIDAndCent.put(s, xlist); + MapOfIDAndCount.put(s, (long)1); + MapOfIDAndWCSS.put(s, (float)0); + } + } + return s; + } + + + /* + * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid + * vector map + * + * hash the projected vector x and update the hash to centroid and counts + * maps + */ + void addtocounter(float[] x, Projector p, + HashMap IDAndCent,HashMap IDandID,int ct, float[] rngvec , HashMap IDandWCSS) { + float[] xt = p.project(x); + + hashvec2(xt,x,IDAndCent, IDandID, ct,rngvec , IDandWCSS); + } + + + /* + * X - data set k - canonical k in k-means l - clustering sub-space Compute + * density mode via iterative deepening hash counting + */ + + public Multimap findDensityModes2() { + + HashMap MapOfIDAndCent1 = new HashMap<>(); + HashMap MapOfIDAndCount1 = new HashMap<>(); + HashMap MapOfIDAndWCSS1 = new HashMap<>(); + + HashMap MapOfIDAndCent2 = new HashMap<>(); + HashMap MapOfIDAndCount2 = new HashMap<>(); + HashMap MapOfIDAndWCSS2 = new HashMap<>(); + + HashMap MapOfIDAndCent3 = new HashMap<>(); + HashMap MapOfIDAndCount3 = new HashMap<>(); + HashMap MapOfIDAndWCSS3 = new HashMap<>(); + + HashMap MapOfIDAndCent4 = new HashMap<>(); + HashMap MapOfIDAndCount4 = new HashMap<>(); + HashMap MapOfIDAndWCSS4 = 
new HashMap<>(); + + HashMap MapOfIDAndCent5 = new HashMap<>(); + HashMap MapOfIDAndCount5 = new HashMap<>(); + HashMap MapOfIDAndWCSS5 = new HashMap<>(); + + HashMap MapOfIDAndCent6 = new HashMap<>(); + HashMap MapOfIDAndCount6 = new HashMap<>(); + HashMap MapOfIDAndWCSS6 = new HashMap<>(); + + HashMap MapOfIDAndCent7 = new HashMap<>(); + HashMap MapOfIDAndCount7 = new HashMap<>(); + HashMap MapOfIDAndWCSS7 = new HashMap<>(); + + HashMap MapOfIDAndCent8 = new HashMap<>(); + HashMap MapOfIDAndCount8 = new HashMap<>(); + HashMap MapOfIDAndWCSS8 = new HashMap<>(); + + HashMap MapOfIDAndCent9 = new HashMap<>(); + HashMap MapOfIDAndCount9 = new HashMap<>(); + HashMap MapOfIDAndWCSS9 = new HashMap<>(); + + HashMap MapOfIDAndCent10 = new HashMap<>(); + HashMap MapOfIDAndCount10 = new HashMap<>(); + HashMap MapOfIDAndWCSS10 = new HashMap<>(); + + HashMap MapOfIDAndCent11 = new HashMap<>(); + HashMap MapOfIDAndCount11 = new HashMap<>(); + HashMap MapOfIDAndWCSS11 = new HashMap<>(); + + HashMap MapOfIDAndCent12 = new HashMap<>(); + HashMap MapOfIDAndCount12 = new HashMap<>(); + HashMap MapOfIDAndWCSS12 = new HashMap<>(); + + + // #create projector matrixs + Projector projector = so.getProjectionType(); + projector.setOrigDim(so.getdim()); + projector.setProjectedDim(so.getDimparameter()); + projector.setRandomSeed(so.getRandomSeed()); +// projector.setRandomSeed(535247432); + projector.init(); + + // #create projector matrixs + Projector projector2 = so.getProjectionType(); + projector2.setOrigDim(so.getdim()); + projector2.setProjectedDim(so.getDimparameter()); + projector2.setRandomSeed(so.getRandomSeed()); +// projector.setRandomSeed(535247432); + projector2.init(); + + // #create projector matrixs + Projector projector3 = so.getProjectionType(); + projector3.setOrigDim(so.getdim()); + projector3.setProjectedDim(so.getDimparameter()); + projector3.setRandomSeed(so.getRandomSeed()); +// projector.setRandomSeed(535247432); + projector3.init(); + + int cutoff = 
so.getCutoff(); + + int ct = 0; + + { + + for (float[] x : so.getRawData()) + { + addtocounter(x, projector, MapOfIDAndCent1, MapOfIDAndCount1,ct++, rngvec, MapOfIDAndWCSS1); + addtocounter(x, projector, MapOfIDAndCent2, MapOfIDAndCount2,ct++, rngvec2,MapOfIDAndWCSS2); + addtocounter(x, projector, MapOfIDAndCent3, MapOfIDAndCount3,ct++, rngvec3,MapOfIDAndWCSS3); + addtocounter(x, projector, MapOfIDAndCent4, MapOfIDAndCount4,ct++, rngvec4,MapOfIDAndWCSS4); + + addtocounter(x, projector2, MapOfIDAndCent5, MapOfIDAndCount5,ct++, rngvec5,MapOfIDAndWCSS5); + addtocounter(x, projector2, MapOfIDAndCent6, MapOfIDAndCount6,ct++, rngvec6, MapOfIDAndWCSS6); + addtocounter(x, projector2, MapOfIDAndCent7, MapOfIDAndCount7,ct++, rngvec7,MapOfIDAndWCSS7); + addtocounter(x, projector2, MapOfIDAndCent8, MapOfIDAndCount8,ct++, rngvec8,MapOfIDAndWCSS8); + + addtocounter(x, projector3, MapOfIDAndCent9, MapOfIDAndCount9,ct++, rngvec9,MapOfIDAndWCSS9); + addtocounter(x, projector3, MapOfIDAndCent10, MapOfIDAndCount10,ct++, rngvec10,MapOfIDAndWCSS10); + addtocounter(x, projector3, MapOfIDAndCent11, MapOfIDAndCount11,ct++, rngvec11,MapOfIDAndWCSS11); + addtocounter(x, projector3, MapOfIDAndCent12, MapOfIDAndCount12,ct++, rngvec12,MapOfIDAndWCSS12); + + + } + } + + System.out.println("NumberOfMicroClustersBeforePruning = "+ MapOfIDAndCent3.size()); + + // next we want to prune the tree by parent count comparison + // follows breadthfirst search + + HashMap denseSetOfIDandCount2_1 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount1.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount1.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount1.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_1.put(parent_id, 0L); + + MapOfIDAndCent1.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + + 
} + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_1.remove(parent_id); + + MapOfIDAndCent1.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_2 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount2.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount2.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount2.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_2.put(parent_id, 0L); + + MapOfIDAndCent2.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_2.remove(parent_id); + + MapOfIDAndCent2.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_3 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount3.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount3.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount3.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_3.put(parent_id, 0L); + + MapOfIDAndCent3.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_3.remove(parent_id); + + MapOfIDAndCent3.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_4 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount4.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) 
(MapOfIDAndCount4.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount4.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_4.put(parent_id, 0L); + + MapOfIDAndCent4.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_4.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_4.remove(parent_id); + + MapOfIDAndCent4.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_4.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_5 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount5.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount5.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount5.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_5.put(parent_id, 0L); + + MapOfIDAndCent5.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_5.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_5.remove(parent_id); + + MapOfIDAndCent5.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_5.put(cur_id, (long) cur_count); + } + } + } + } + } + + + + HashMap denseSetOfIDandCount2_6 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount6.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount6.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount6.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_6.put(parent_id, 0L); + + MapOfIDAndCent6.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_6.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > 
parent_count) { + denseSetOfIDandCount2_6.remove(parent_id); + + MapOfIDAndCent6.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_6.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_7 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount7.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount7.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount7.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_7.put(parent_id, 0L); + + MapOfIDAndCent7.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_7.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_7.remove(parent_id); + + MapOfIDAndCent7.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_7.put(cur_id, (long) cur_count); + } + } + } + } + } + + HashMap denseSetOfIDandCount2_8 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount8.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount8.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount8.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_8.put(parent_id, 0L); + + MapOfIDAndCent8.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_8.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_8.remove(parent_id); + + MapOfIDAndCent8.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_8.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_9 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount9.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount9.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + 
int parent_count = (int) (MapOfIDAndCount9.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_9.put(parent_id, 0L); + + MapOfIDAndCent9.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_9.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_9.remove(parent_id); + + MapOfIDAndCent9.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_9.put(cur_id, (long) cur_count); + } + } + } + } + } + + HashMap denseSetOfIDandCount2_10 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount10.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount10.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount10.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_10.put(parent_id, 0L); + + MapOfIDAndCent10.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_10.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_10.remove(parent_id); + + MapOfIDAndCent10.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_10.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_11 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount11.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount11.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount11.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_11.put(parent_id, 0L); + + MapOfIDAndCent11.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_11.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_11.remove(parent_id); + + 
MapOfIDAndCent11.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_11.put(cur_id, (long) cur_count); + } + } + } + } + } + + HashMap denseSetOfIDandCount2_12 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount12.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount12.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount12.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_12.put(parent_id, 0L); + + MapOfIDAndCent12.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_12.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_12.remove(parent_id); + + MapOfIDAndCent12.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_12.put(cur_id, (long) cur_count); + } + } + } + } + } + + //remove keys with support less than 1 + Stream> stream2_1 = denseSetOfIDandCount2_1.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_1= new ArrayList<>(); + // sort and limit the list + stream2_1.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_1.add(x.getKey())); + + + Stream> stream2_2 = denseSetOfIDandCount2_2.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_2= new ArrayList<>(); + // sort and limit the list + stream2_2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_2.add(x.getKey())); + + + Stream> stream2_3 = denseSetOfIDandCount2_3.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_3= new ArrayList<>(); + // sort and limit the list + stream2_3.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_3.add(x.getKey())); + + + Stream> stream2_4 = denseSetOfIDandCount2_4.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_4= new ArrayList<>(); + // sort and limit the list + stream2_4.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_4.add(x.getKey())); + + Stream> stream2_5 = denseSetOfIDandCount2_5.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_5= new ArrayList<>(); + // sort and limit the list + stream2_5.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_5.add(x.getKey())); + + + Stream> stream2_6 = denseSetOfIDandCount2_6.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_6= new ArrayList<>(); + // sort and limit the list + stream2_6.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_6.add(x.getKey())); + + + Stream> stream2_7 = denseSetOfIDandCount2_7.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_7= new ArrayList<>(); + // sort and limit the list + stream2_7.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_7.add(x.getKey())); + + + Stream> stream2_8 = denseSetOfIDandCount2_8.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_8= new ArrayList<>(); + // sort and limit the list + stream2_8.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_8.add(x.getKey())); + + + Stream> stream2_9 = denseSetOfIDandCount2_9.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_9= new ArrayList<>(); + // sort and limit the list + stream2_9.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_9.add(x.getKey())); + + + Stream> stream2_10 = denseSetOfIDandCount2_10.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_10= new ArrayList<>(); + // sort and limit the list + stream2_10.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_10.add(x.getKey())); + + + Stream> stream2_11 = denseSetOfIDandCount2_11.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_11= new ArrayList<>(); + // sort and limit the list + stream2_11.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_11.add(x.getKey())); + + Stream> stream2_12 = denseSetOfIDandCount2_12.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_12= new ArrayList<>(); + // sort and limit the list + stream2_12.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_12.add(x.getKey())); + +// finding elbows + JythonTest elbowcalculator = new JythonTest(); + double sum_jt = 0; + + num_of_clusters_stage1[0]= elbowcalculator.find_elbow(sortedIDList2_1); + num_of_clusters_stage1[1]= elbowcalculator.find_elbow(sortedIDList2_2); + num_of_clusters_stage1[2]= elbowcalculator.find_elbow(sortedIDList2_3); + num_of_clusters_stage1[3]= elbowcalculator.find_elbow(sortedIDList2_4); + num_of_clusters_stage1[4]= elbowcalculator.find_elbow(sortedIDList2_5); + num_of_clusters_stage1[5]= elbowcalculator.find_elbow(sortedIDList2_6); + num_of_clusters_stage1[6]= elbowcalculator.find_elbow(sortedIDList2_7); + num_of_clusters_stage1[7]= elbowcalculator.find_elbow(sortedIDList2_8); + num_of_clusters_stage1[8]= elbowcalculator.find_elbow(sortedIDList2_9); + num_of_clusters_stage1[9]= elbowcalculator.find_elbow(sortedIDList2_10); + num_of_clusters_stage1[10]= elbowcalculator.find_elbow(sortedIDList2_11); + num_of_clusters_stage1[11]= 
elbowcalculator.find_elbow(sortedIDList2_12); + + for (int i=0 ; i<12; i++) { + + //int num_of_clusters_2= elbowcalculator.find_elbow(counts); +//// System.out.println("\n" + "No. of clusters_stage1 = " + num_of_clusters_stage1[i]); + //System.out.println( "No. of clusters_by_COUNT = " + num_of_clusters_2); +//// System.out.println( "************************************************************" ); + sum_jt = sum_jt + num_of_clusters_stage1[i]; + } + System.out.println("\n" + "sum of No. of clusters_stage1 = " + sum_jt); + + double avg_clus_stage1 = Math.ceil(sum_jt / 12.0) ; + System.out.println("\n" + "Average of No. of clusters_stage1 = " + avg_clus_stage1 ); + System.out.println( "************************************************************" ); + // finding the range of K for offline clustering : + min_k = (int) (avg_clus_stage1 - 3) ; + + max_k = (int) (avg_clus_stage1 + 3) ; + + if (min_k < 3 ) { min_k = 3 ; max_k = 9; } ; + + + float WCSS1 = 0; + float WCSS2 = 0; + float WCSS3 = 0; + float WCSS4 = 0; + float WCSS5 = 0; + float WCSS6 = 0; + float WCSS7 = 0; + float WCSS8 = 0; + float WCSS9 = 0; + float WCSS10 = 0; + float WCSS11 = 0; + float WCSS12 = 0; + + + HashMap denseSetOfIDandCount2 = new HashMap(); + HashMap MapOfIDAndCent = new HashMap<>(); + HashMap MapOfIDAndCount = new HashMap<>(); + HashMap MapOfIDAndWCSS = new HashMap<>(); + + + //* THIS IS THE RUNTIME CALCULATION OF WCSS STATISTICS WHICH IS DONE ONLINE: + + for (Long keys: sortedIDList2_1){ + WCSS1 = WCSS1 + MapOfIDAndWCSS1.get(keys);} + + for (Long keys: sortedIDList2_2) + { WCSS2 = WCSS2 + MapOfIDAndWCSS2.get(keys);} + + for (Long keys: sortedIDList2_3) + { WCSS3 = WCSS3 + MapOfIDAndWCSS3.get(keys);} + + for (Long keys: sortedIDList2_4) + { WCSS4 = WCSS4 + MapOfIDAndWCSS4.get(keys);} + + for (Long keys: sortedIDList2_5) + { WCSS5 = WCSS5 + MapOfIDAndWCSS5.get(keys);} + + for (Long keys: sortedIDList2_6){ + WCSS6 = WCSS6 + MapOfIDAndWCSS6.get(keys);} + + for (Long keys: sortedIDList2_7) + { 
WCSS7 = WCSS7 + MapOfIDAndWCSS7.get(keys);} + + for (Long keys: sortedIDList2_8) + { WCSS8 = WCSS8 + MapOfIDAndWCSS8.get(keys);} + + for (Long keys: sortedIDList2_9) + { WCSS9 = WCSS9 + MapOfIDAndWCSS9.get(keys);} + + for (Long keys: sortedIDList2_10) + { WCSS10 = WCSS10 + MapOfIDAndWCSS10.get(keys);} + + for (Long keys: sortedIDList2_11) + { WCSS11 = WCSS11 + MapOfIDAndWCSS11.get(keys);} + + for (Long keys: sortedIDList2_12) + { WCSS12 = WCSS12 + MapOfIDAndWCSS12.get(keys);} + + + + System.out.print(" wcss1 = " + WCSS1); + + System.out.print(" wcss2 = " + WCSS2); + + System.out.print(" wcss3 = " + WCSS3); + + System.out.print(" wcss4 = " + WCSS4); + + System.out.print(" wcss5 = " + WCSS5); + + System.out.print(" wcss6 = " + WCSS6); + + System.out.print(" wcss7 = " + WCSS7); + + System.out.print(" wcss8 = " + WCSS8); + + System.out.print(" wcss9 = " + WCSS9); + + System.out.print(" wcss10 = " + WCSS10); + + System.out.print(" wcss11 = " + WCSS11); + + System.out.print(" wcss12 = " + WCSS12); + +// float arr[] = {WCSS_off_1,WCSS_off_2,WCSS_off_3,WCSS_off_4,WCSS_off_5,WCSS_off_6,WCSS_off_7,WCSS_off_8,WCSS_off_9,WCSS_off_10}; + float arr[] = {WCSS1,WCSS2,WCSS3,WCSS4,WCSS5,WCSS6,WCSS7,WCSS8,WCSS9,WCSS10, WCSS11, WCSS12 }; + + int index_of_max = smallest(arr); + + if (index_of_max == 0){ + MapOfIDAndCount = MapOfIDAndCount1; + MapOfIDAndCent = MapOfIDAndCent1; + MapOfIDAndWCSS = MapOfIDAndWCSS1; + denseSetOfIDandCount2 = denseSetOfIDandCount2_1; + System.out.println("winner = tree1"); + } + if (index_of_max == 1){ + MapOfIDAndCount = MapOfIDAndCount2; + MapOfIDAndCent = MapOfIDAndCent2; + MapOfIDAndWCSS = MapOfIDAndWCSS2; + denseSetOfIDandCount2 = denseSetOfIDandCount2_2; + System.out.println("winner = tree2"); + } + if (index_of_max == 2){ + MapOfIDAndCount = MapOfIDAndCount3; + MapOfIDAndCent = MapOfIDAndCent3; + MapOfIDAndWCSS = MapOfIDAndWCSS3; + denseSetOfIDandCount2 = denseSetOfIDandCount2_3; + System.out.println("winner = tree3"); + } + if (index_of_max == 3) { + 
MapOfIDAndCount = MapOfIDAndCount4; + MapOfIDAndCent = MapOfIDAndCent4; + MapOfIDAndWCSS = MapOfIDAndWCSS4; + denseSetOfIDandCount2 = denseSetOfIDandCount2_4; + System.out.println("winner = tree4"); + } + + if (index_of_max == 4) { + MapOfIDAndCount = MapOfIDAndCount5; + MapOfIDAndCent = MapOfIDAndCent5; + MapOfIDAndWCSS = MapOfIDAndWCSS5; + denseSetOfIDandCount2 = denseSetOfIDandCount2_5; + System.out.println("winner = tree5"); + } + if (index_of_max == 5) { + MapOfIDAndCount = MapOfIDAndCount6; + MapOfIDAndCent = MapOfIDAndCent6; + MapOfIDAndWCSS = MapOfIDAndWCSS6; + denseSetOfIDandCount2 = denseSetOfIDandCount2_6; + System.out.println("winner = tree6"); + } + if (index_of_max == 6) { + MapOfIDAndCount = MapOfIDAndCount7; + MapOfIDAndCent = MapOfIDAndCent7; + MapOfIDAndWCSS = MapOfIDAndWCSS7; + denseSetOfIDandCount2 = denseSetOfIDandCount2_7; + System.out.println("winner = tree7"); + } + if (index_of_max == 7) { + MapOfIDAndCount = MapOfIDAndCount8; + MapOfIDAndCent = MapOfIDAndCent8; + MapOfIDAndWCSS = MapOfIDAndWCSS8; + denseSetOfIDandCount2 = denseSetOfIDandCount2_8; + System.out.println("winner = tree8"); + } + if (index_of_max == 8) { + MapOfIDAndCount = MapOfIDAndCount9; + MapOfIDAndCent = MapOfIDAndCent9; + MapOfIDAndWCSS = MapOfIDAndWCSS9; + denseSetOfIDandCount2 = denseSetOfIDandCount2_9; + System.out.println("winner = tree9"); + } + if (index_of_max == 9) { + MapOfIDAndCount = MapOfIDAndCount10; + MapOfIDAndCent = MapOfIDAndCent10; + MapOfIDAndWCSS = MapOfIDAndWCSS10; + denseSetOfIDandCount2 = denseSetOfIDandCount2_10; + System.out.println("winner = tree10"); + } + if (index_of_max == 10) { + MapOfIDAndCount = MapOfIDAndCount11; + MapOfIDAndCent = MapOfIDAndCent11; + MapOfIDAndWCSS = MapOfIDAndWCSS11; + denseSetOfIDandCount2 = denseSetOfIDandCount2_11; + System.out.println("winner = tree11"); + } + if (index_of_max == 11) { + MapOfIDAndCount = MapOfIDAndCount12; + MapOfIDAndCent = MapOfIDAndCent12; + MapOfIDAndWCSS = MapOfIDAndWCSS12; + 
denseSetOfIDandCount2 = denseSetOfIDandCount2_12; + System.out.println("winner = tree12"); + } + + + System.out.println("NumberOfMicroClustersAfterPruning&beforesortingLimit = "+ denseSetOfIDandCount2.size()); + + //remove keys with support less than 1 + Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2= new ArrayList<>(); + // sort and limit the list + stream2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2.add(x.getKey())); + + Multimap multimapWeightAndCent = ArrayListMultimap.create(); + + for (Long keys: sortedIDList2) + + { + + multimapWeightAndCent.put((Long)(MapOfIDAndCount.get(keys)), (float[]) (MapOfIDAndCent.get(keys))); + + } + + return multimapWeightAndCent; + +} + + public void run() { + rngvec = new float[so.getDimparameter()]; + rngvec2 = new float[so.getDimparameter()]; + rngvec3 = new float[so.getDimparameter()]; + rngvec4 = new float[so.getDimparameter()]; + rngvec5 = new float[so.getDimparameter()]; + + rngvec6 = new float[so.getDimparameter()]; + rngvec7 = new float[so.getDimparameter()]; + rngvec8 = new float[so.getDimparameter()]; + rngvec9 = new float[so.getDimparameter()]; + rngvec10 = new float[so.getDimparameter()]; + rngvec11 = new float[so.getDimparameter()]; + rngvec12 = new float[so.getDimparameter()]; + + counter = 0; + boolean randVect = so.getRandomVector(); + + // Random r = new Random(so.getRandomSeed()); + // Random r = new Random(3800635955020675334L) ; + Random r = new Random(); + Random r2 = new Random(); + Random r3 = new Random(); + Random r4 = new Random(); + Random r5 = new Random(); + Random r6 = new Random(); + Random r7 = new Random(); + Random r8 = new Random(); + Random r9 = new Random(); + Random r10 = new Random(); + Random r11 = new Random(); + Random r12 = new Random(); + + if (randVect==true){ + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) r.nextGaussian(); + for (int i = 0; i < 
so.getDimparameter(); i++) + rngvec2[i] = (float) r2.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec3[i] = (float) r3.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec4[i] = (float) r4.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec5[i] = (float) r5.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec6[i] = (float) r6.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec7[i] = (float) r7.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec8[i] = (float) r8.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec9[i] = (float) r9.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec10[i] = (float) r10.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec11[i] = (float) r11.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec11[i] = (float) r12.nextGaussian(); + + } + + else { + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) 0; + + } + + Multimap WeightAndClusters = findDensityModes2(); + + Listcentroids2 = new ArrayList<>(); + List weights2 =new ArrayList<>(); + + + System.out.println("\tNumberOfMicroClusters_AfterPruning = "+ WeightAndClusters.size()); + // System.out.println("getRandomVector = "+ randVect); + + + for (Long weights : WeightAndClusters.keys()) + { + + weights2.add((float)weights); + + } + + + for (Long weight : WeightAndClusters.keySet()) + + { + + centroids2.addAll(WeightAndClusters.get(weight)); + + } + + +// Agglomerative3 aggloOffline = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids2, so.getk()); +// aggloOffline.setWeights(weights2); +// this.centroids = aggloOffline.getCentroids(); + + KMeans2 aggloOffline2 = new KMeans2(); + aggloOffline2.setRawData(centroids2); + aggloOffline2.setWeights(weights2); + + + List data1 = null; + data1 = so.getRawData(); + //System.out.println("\n" + "No. 
of Data Points = " + so.getRawData().size() ); + System.out.println("\n" + "No. of Data Points = " + data1.size() ); + + List elbow_wcss = new ArrayList<>(); + + for (int k=min_k; k<=max_k ;k++) { + + aggloOffline2.setK(k); + List cent1 = aggloOffline2.getCentroids(); + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(cent1, data1)); + long tempu = (long) StatTests.WCSSECentroidsFloat(cent1, data1); + elbow_wcss.add(tempu); + } + + + // finding elbows + JythonTest elbowcalculator = new JythonTest(); + double sum_jt = 0; + + int num_of_clusters_stage2 = min_k + elbowcalculator.find_elbow(elbow_wcss); + + System.out.println("\n" + "No. of clusters_stage_2_Final = " + num_of_clusters_stage2 ); + + + // final choice of centroids : ( repetative calculation , please optimize ) + aggloOffline2.setK(num_of_clusters_stage2); + + this.centroids = aggloOffline2.getCentroids(); + + } + + + public static void main(String[] args) throws FileNotFoundException, + IOException , InterruptedException { + + // int k = 10;//6; + // int d = 200;//16; + // int n = 10000; + // float var = 1.5f; + // int count = 1; + // System.out.printf("ClusterVar\t"); + // for (int i = 0; i < count; i++) + // System.out.printf("Trial%d\t", i); + // System.out.printf("RealWCSS\n"); + + // String Output = "/C:/Users/deysn/Desktop/temp/har/run_results/10runs/OutputTwrpCents_mainfunc_1" ; + + // float f = var; + float avgrealwcss = 0; + float avgtime = 0; + // System.out.printf("%f\t", f); + // GenerateData gen = new GenerateData(k, n/k, d, f, true, .5f); + + // gen.writeCSVToFile(new File("/home/lee/Desktop/reclsh/in.csv")); + // List data = "/C:/Users/user/Desktop/temp/OutputTwrpCents1" + + // RPHashObject o = new SimpleArrayReader(gen.data, k); + + boolean raw = Boolean.parseBoolean(("raw")); + List data = null; + List data_in_round = new ArrayList() ; + // "C:\Users\sayan\OneDrive - University of Cincinnati\Documents\downloaded\run_results\run_results\3runs\har_k6\1D.txt" + + String inputfile = 
"C:/Users/sayan/OneDrive - University of Cincinnati/Documents/downloaded/run_results/run_results/3runs/har_k6/1D.txt" ; + System.out.println(inputfile); + + data = VectorUtil.readFile( inputfile , raw); + int count1=0; + int count2=0; + // List cents_aged = null ; //new ArrayList(); /// may required to be initialized + // List cents_prev_round = null ; /// may required to be properly initialized + + boolean flag = true; // indicates first round if true else is false + + + List cents_aged = null; //null ; //new ArrayList(); /// may required to be initialized + List cents_prev_round = new ArrayList() ; //null ; /// may required to be properly initialized + + int round = 0; + for (float[] element : data) + + + { + count1 = count1+1; + //System.out.println(count1); + //System.out.println(element); + data_in_round.add(data.get(count1-1)); + count2 = count2 +1; + + //System.out.println(count2); + + if (count2 >= 1000) { + //if (count2 == 10299) { + System.out.println(count2); + + round = round + 1; + System.out.println("round is : " + round + "\n" ); + int dummyk = 8; + RPHashObject o = new SimpleArrayReader(data_in_round, dummyk); + + + o.setDimparameter(16); + o.setCutoff(70); + o.setRandomVector(true); + +// System.out.println("cutoff = "+ o.getCutoff()); +// System.out.println("get_random_Vector = "+ o.getRandomVector()); + + TWRPv6_wcss_offline2_TEST2_10runs_agingcents rphit = new TWRPv6_wcss_offline2_TEST2_10runs_agingcents(o); + long startTime = System.nanoTime(); + + + List centsr = null ; //null; // new ArrayList(); + centsr = rphit.getCentroids(); // check if overwritten ? 
otherwise clear + + avgtime += (System.nanoTime() - startTime) / 100000000; + +// avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(),gen.getData()); + +// System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, data_in_round)); + System.out.println("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); +// System.gc(); + +// System.out.printf("%.0f\n", avgrealwcss / count); + // if prev round cents are null i.e. 1st round aged cents = cents of this round i.e no aging + // if prev round cents are there then merge aged cents with this round cents + //System.out.println(centsr); + //System.out.println(centsr.size() ); + + if (flag == true) { + + //System.out.println(cents_prev_round); + //System.out.println(centsr); + cents_prev_round = centsr ; + cents_aged = centsr; // have to modify + // Centroid.removeallobjects(cents_aged); // have to modify + flag=false ; + + } + + // cents_prev_round.clear(); + cents_prev_round = cents_aged; + + // System.out.println(centsr); + // System.out.println(cents_aged); + // System.out.println(cents_prev_round); + + int pos=0; + List test1 = new ArrayList(); + for (Centroid vector : centsr) { + // System.out.println(vector.dimensions + " dimensions " + "\n"); + pos=pos+1; + int index1 = VectorUtil.findNearestDistance(vector, cents_prev_round); + // System.out.println( " nearest one : " + index1 + "\n"); + + // call weighted_merge(float cnt_1, float[] x_1, float cnt_2, float[] x_2) + float[] current_cent = vector.centroid(); + float[] prev_mapped_cent= cents_prev_round.get(index1).centroid(); + double weight1= 1.0; + double weight2= 0.25; + float[][] ret = ageCentriods.weighted_merge( weight1 ,current_cent, weight2, prev_mapped_cent); + float[] cent_merge = ret[1] ; + Centroid test = new Centroid(cent_merge); //Centroid(float[] data) + test1.add(cent_merge); + + } + + cents_aged.clear(); + + int size = test1.size(); + for (int i=0; i<= size-1; i++ ) 
{ + Centroid c = new Centroid(test1.get(i)); + cents_aged.add(c); }; + + System.out.println(cents_aged); + +// + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(cents_aged, data_in_round)); + System.out.println("xxxxxxxxxxxxxxxx this is aged cents xxxxxxxxxxxxxxxxxxxxx"); + +// input: "C:/Users/sayan/OneDrive - University of Cincinnati/Documents/downloaded/run_results/run_results/3runs/har_k6/1D.txt" + + String Output = "/C:/Users/sayan/OneDrive - University of Cincinnati/Documents/downloaded/to_del/har_k6_kmeans_70_cutoff"+"_"+round+"_"+".csv" ; + + VectorUtil.writeCentroidsToFile(new File(Output),cents_aged, false); + +// System.out.printf("%.0f\n", avgrealwcss / count); + + data_in_round.clear(); + count2=0; + + // cents_prev_round.clear(); + // cents_prev_round = cents_aged; + + // System.out.println("ccccccccccccccccccccc this issize cccccccccccccccccccc : " + cents_prev_round.size() ); + + System.gc(); + + } // end if + + System.gc(); + + }// end for + + + System.gc(); + } // end main + + + + + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + this.so.setK(getk); + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + return false; + } + + //@Override + public void setCutoff(int getCutoff) { + this.so.setCutoff(getCutoff); + } + + //@Override + public void setRandomVector(boolean getRandomVector) { + this.so.setRandomVector(getRandomVector); + } + + +} diff --git 
a/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_10runs_agingmicroclusters.java b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_10runs_agingmicroclusters.java new file mode 100644 index 0000000..ee58799 --- /dev/null +++ b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_10runs_agingmicroclusters.java @@ -0,0 +1,1471 @@ +package edu.uc.rphash; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +//import java.util.Arrays; +import java.util.HashMap; +//import java.util.Iterator; +//import java.util.LinkedHashMap; +import java.util.List; +//import java.util.Map; +import java.util.Map.Entry; +import java.util.Random; +import java.util.TreeSet; +import java.util.stream.IntStream; +import java.util.stream.Stream; +import java.util.Collections; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.kneefinder.JythonTest; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.clusterers.Agglomerative3; +import edu.uc.rphash.tests.clusterers.KMeans2; +import edu.uc.rphash.tests.clusterers.Agglomerative3.ClusteringType; +import edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.VectorUtil; +import edu.uc.rphash.aging.ageCentriods; + +//import org.apache.commons.collections.map.MultiValueMap; +//import org.apache.commons.collections.map.*; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Multimap; + +import java.lang.*; + + + +// this algorithm runs twrp 10 times : (only the random bisection vector varies, the Projection matrix remains same) +// and selects the one which has the best wcss offline for the 10X candidate centroids. 
+public class TWRPv6_wcss_offline2_TEST2_10runs_agingmicroclusters implements Clusterer, Runnable { + + boolean znorm = false; + int [] num_of_clusters_stage1 = new int[12]; + int min_k; + int max_k; + + // convert this to an array of arrays + private int counter; + private float[] rngvec; + private float[] rngvec2; + private float[] rngvec3; + private float[] rngvec4; + private float[] rngvec5; + private float[] rngvec6; + private float[] rngvec7; + private float[] rngvec8; + private float[] rngvec9; + private float[] rngvec10; + private float[] rngvec11; + private float[] rngvec12; + + private List centroids = null; + + private RPHashObject so; + + public TWRPv6_wcss_offline2_TEST2_10runs_agingmicroclusters(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + return getCentroids(); + } + + @Override + public List getCentroids() { + if (centroids == null) + run(); + return centroids; + } + + //private Multimap WeightsandCents ; + + +// public Multimap getMicroclusterWeightsandCents (RPHashObject so) { +// this.so = so; +// return getCentroids(); +// } + + +// This function returns the square of the euclidean distance. + public static float distancesq(float[] x, float[] y) { + if (x.length < 1) + return Float.MAX_VALUE; + if (y.length < 1) + return Float.MAX_VALUE; + float dist = (x[0] - y[0]) * (x[0] - y[0]); + for (int i = 1; i < x.length; i++) + dist += ((x[i] - y[i]) * (x[i] - y[i])); +// return (float) Math.sqrt(dist); + return dist; + } + + +// This method finds the smallest of the numbers and returns that index. 
+ + public static int smallest(float[] arr) + { + // Initialize minimum element + float min = arr[0]; + int minindex = 0; + // System.out.println(" LENGHT : " + arr.length); + // Traverse array elements from second and + // compare every element with current max + for (int i = 1; i < (arr.length); i++) { + // System.out.println("the min value of i : " + i); + if (arr[i]< min) { + min = arr[i]; + minindex = i; + } + } + // System.out.println("the min value is : " + min); + // System.out.println("the index for min val is : " + minindex); + return minindex; + } + + + /* + * X - set of vectors compute the medoid of a vector set + */ + float[] medoid(List X) { + float[] ret = X.get(0); + for (int i = 1; i < X.size(); i++) { + for (int j = 0; j < ret.length; j++) { + ret[j] += X.get(i)[j]; + } + } + for (int j = 0; j < ret.length; j++) { + ret[j] = ret[j] / ((float) X.size()); + } + return ret; + } + +// this updates the map two cents with different weights are merged into one. + public static float[][] UpdateHashMap(float cnt_1, float[] x_1, float wcss_1, + float cnt_2, float[] x_2 , float wcss_2) { + + float cnt_r = cnt_1 + cnt_2; + + float[] x_r = new float[x_1.length]; + + for (int i = 0; i < x_1.length; i++) { + x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; + + } + + float wcss = ( ((wcss_1 + distancesq(x_r,x_1)) ) + distancesq(x_r,x_2) ); + + float[][] ret = new float[3][]; + ret[0] = new float[1]; + ret[0][0] = cnt_r; + ret[1] = x_r; + ret[2]= new float [1]; + ret[2][0]= wcss; + return ret; + + } + + + + public long hashvec2( float[] xt, float[] x, + HashMap MapOfIDAndCent, HashMap MapOfIDAndCount, int ct, float[] rngvec, HashMap MapOfIDAndWCSS) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { +// s <<= 1; + s = s << 1 ; // left shift the bits of s by 1. 
+ if (xt[i] > rngvec[i]) + s= s+1; + + if (MapOfIDAndCent.containsKey(s)) { + + float CurrentCount = MapOfIDAndCount.get(s); + float CurrentCent [] = MapOfIDAndCent.get(s); + float CountForIncomingVector = 1; + float IncomingVector [] = x; + float currentWcss= MapOfIDAndWCSS.get(s); + float incomingWcss= 0; + + float[][] MergedValues = UpdateHashMap(CurrentCount , CurrentCent, currentWcss, CountForIncomingVector, IncomingVector, incomingWcss ); + + Long UpdatedCount = (long) MergedValues[0][0] ; + + float[] MergedVector = MergedValues[1] ; + + float wcss= MergedValues[2][0]; + + MapOfIDAndCount.put(s , UpdatedCount); + + MapOfIDAndCent.put(s, MergedVector); + + MapOfIDAndWCSS.put(s, wcss); + + } + + else { + + float[] xlist = x; + MapOfIDAndCent.put(s, xlist); + MapOfIDAndCount.put(s, (long)1); + MapOfIDAndWCSS.put(s, (float)0); + } + } + return s; + } + + + /* + * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid + * vector map + * + * hash the projected vector x and update the hash to centroid and counts + * maps + */ + void addtocounter(float[] x, Projector p, + HashMap IDAndCent,HashMap IDandID,int ct, float[] rngvec , HashMap IDandWCSS) { + float[] xt = p.project(x); + + hashvec2(xt,x,IDAndCent, IDandID, ct,rngvec , IDandWCSS); + } + + + /* + * X - data set k - canonical k in k-means l - clustering sub-space Compute + * density mode via iterative deepening hash counting + */ + + public Multimap findDensityModes2(List data_in_round) { + + HashMap MapOfIDAndCent1 = new HashMap<>(); + HashMap MapOfIDAndCount1 = new HashMap<>(); + HashMap MapOfIDAndWCSS1 = new HashMap<>(); + + HashMap MapOfIDAndCent2 = new HashMap<>(); + HashMap MapOfIDAndCount2 = new HashMap<>(); + HashMap MapOfIDAndWCSS2 = new HashMap<>(); + + HashMap MapOfIDAndCent3 = new HashMap<>(); + HashMap MapOfIDAndCount3 = new HashMap<>(); + HashMap MapOfIDAndWCSS3 = new HashMap<>(); + + HashMap MapOfIDAndCent4 = new HashMap<>(); + HashMap MapOfIDAndCount4 = new HashMap<>(); + HashMap 
MapOfIDAndWCSS4 = new HashMap<>(); + + HashMap MapOfIDAndCent5 = new HashMap<>(); + HashMap MapOfIDAndCount5 = new HashMap<>(); + HashMap MapOfIDAndWCSS5 = new HashMap<>(); + + HashMap MapOfIDAndCent6 = new HashMap<>(); + HashMap MapOfIDAndCount6 = new HashMap<>(); + HashMap MapOfIDAndWCSS6 = new HashMap<>(); + + HashMap MapOfIDAndCent7 = new HashMap<>(); + HashMap MapOfIDAndCount7 = new HashMap<>(); + HashMap MapOfIDAndWCSS7 = new HashMap<>(); + + HashMap MapOfIDAndCent8 = new HashMap<>(); + HashMap MapOfIDAndCount8 = new HashMap<>(); + HashMap MapOfIDAndWCSS8 = new HashMap<>(); + + HashMap MapOfIDAndCent9 = new HashMap<>(); + HashMap MapOfIDAndCount9 = new HashMap<>(); + HashMap MapOfIDAndWCSS9 = new HashMap<>(); + + HashMap MapOfIDAndCent10 = new HashMap<>(); + HashMap MapOfIDAndCount10 = new HashMap<>(); + HashMap MapOfIDAndWCSS10 = new HashMap<>(); + + HashMap MapOfIDAndCent11 = new HashMap<>(); + HashMap MapOfIDAndCount11 = new HashMap<>(); + HashMap MapOfIDAndWCSS11 = new HashMap<>(); + + HashMap MapOfIDAndCent12 = new HashMap<>(); + HashMap MapOfIDAndCount12 = new HashMap<>(); + HashMap MapOfIDAndWCSS12 = new HashMap<>(); + + + // #create projector matrixs + Projector projector = so.getProjectionType(); + projector.setOrigDim(so.getdim()); + projector.setProjectedDim(so.getDimparameter()); + projector.setRandomSeed(so.getRandomSeed()); +// projector.setRandomSeed(535247432); + projector.init(); + + // #create projector matrixs + Projector projector2 = so.getProjectionType(); + projector2.setOrigDim(so.getdim()); + projector2.setProjectedDim(so.getDimparameter()); + projector2.setRandomSeed(so.getRandomSeed()); +// projector.setRandomSeed(535247432); + projector2.init(); + + // #create projector matrixs + Projector projector3 = so.getProjectionType(); + projector3.setOrigDim(so.getdim()); + projector3.setProjectedDim(so.getDimparameter()); + projector3.setRandomSeed(so.getRandomSeed()); +// projector.setRandomSeed(535247432); + projector3.init(); + + int 
cutoff = so.getCutoff(); + + int ct = 0; + + { + + for (float[] x : data_in_round) + + + + { + addtocounter(x, projector, MapOfIDAndCent1, MapOfIDAndCount1,ct++, rngvec, MapOfIDAndWCSS1); + addtocounter(x, projector, MapOfIDAndCent2, MapOfIDAndCount2,ct++, rngvec2,MapOfIDAndWCSS2); + addtocounter(x, projector, MapOfIDAndCent3, MapOfIDAndCount3,ct++, rngvec3,MapOfIDAndWCSS3); + addtocounter(x, projector, MapOfIDAndCent4, MapOfIDAndCount4,ct++, rngvec4,MapOfIDAndWCSS4); + + addtocounter(x, projector2, MapOfIDAndCent5, MapOfIDAndCount5,ct++, rngvec5,MapOfIDAndWCSS5); + addtocounter(x, projector2, MapOfIDAndCent6, MapOfIDAndCount6,ct++, rngvec6, MapOfIDAndWCSS6); + addtocounter(x, projector2, MapOfIDAndCent7, MapOfIDAndCount7,ct++, rngvec7,MapOfIDAndWCSS7); + addtocounter(x, projector2, MapOfIDAndCent8, MapOfIDAndCount8,ct++, rngvec8,MapOfIDAndWCSS8); + + addtocounter(x, projector3, MapOfIDAndCent9, MapOfIDAndCount9,ct++, rngvec9,MapOfIDAndWCSS9); + addtocounter(x, projector3, MapOfIDAndCent10, MapOfIDAndCount10,ct++, rngvec10,MapOfIDAndWCSS10); + addtocounter(x, projector3, MapOfIDAndCent11, MapOfIDAndCount11,ct++, rngvec11,MapOfIDAndWCSS11); + addtocounter(x, projector3, MapOfIDAndCent12, MapOfIDAndCount12,ct++, rngvec12,MapOfIDAndWCSS12); + + + } + } + + System.out.println("NumberOfMicroClustersBeforePruning = "+ MapOfIDAndCent3.size()); + + // next we want to prune the tree by parent count comparison + // follows breadthfirst search + + HashMap denseSetOfIDandCount2_1 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount1.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount1.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount1.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_1.put(parent_id, 0L); + + MapOfIDAndCent1.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_1.put(cur_id, (long) 
cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_1.remove(parent_id); + + MapOfIDAndCent1.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_2 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount2.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount2.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount2.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_2.put(parent_id, 0L); + + MapOfIDAndCent2.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_2.remove(parent_id); + + MapOfIDAndCent2.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_3 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount3.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount3.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount3.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_3.put(parent_id, 0L); + + MapOfIDAndCent3.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_3.remove(parent_id); + + MapOfIDAndCent3.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_4 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount4.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) 
(MapOfIDAndCount4.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount4.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_4.put(parent_id, 0L); + + MapOfIDAndCent4.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_4.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_4.remove(parent_id); + + MapOfIDAndCent4.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_4.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_5 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount5.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount5.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount5.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_5.put(parent_id, 0L); + + MapOfIDAndCent5.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_5.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_5.remove(parent_id); + + MapOfIDAndCent5.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_5.put(cur_id, (long) cur_count); + } + } + } + } + } + + + + HashMap denseSetOfIDandCount2_6 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount6.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount6.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount6.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_6.put(parent_id, 0L); + + MapOfIDAndCent6.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_6.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > 
parent_count) { + denseSetOfIDandCount2_6.remove(parent_id); + + MapOfIDAndCent6.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_6.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_7 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount7.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount7.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount7.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_7.put(parent_id, 0L); + + MapOfIDAndCent7.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_7.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_7.remove(parent_id); + + MapOfIDAndCent7.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_7.put(cur_id, (long) cur_count); + } + } + } + } + } + + HashMap denseSetOfIDandCount2_8 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount8.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount8.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount8.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_8.put(parent_id, 0L); + + MapOfIDAndCent8.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_8.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_8.remove(parent_id); + + MapOfIDAndCent8.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_8.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_9 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount9.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount9.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + 
int parent_count = (int) (MapOfIDAndCount9.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_9.put(parent_id, 0L); + + MapOfIDAndCent9.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_9.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_9.remove(parent_id); + + MapOfIDAndCent9.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_9.put(cur_id, (long) cur_count); + } + } + } + } + } + + HashMap denseSetOfIDandCount2_10 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount10.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount10.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount10.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_10.put(parent_id, 0L); + + MapOfIDAndCent10.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_10.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_10.remove(parent_id); + + MapOfIDAndCent10.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_10.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_11 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount11.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount11.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount11.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_11.put(parent_id, 0L); + + MapOfIDAndCent11.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_11.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_11.remove(parent_id); + + 
MapOfIDAndCent11.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_11.put(cur_id, (long) cur_count); + } + } + } + } + } + + HashMap denseSetOfIDandCount2_12 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount12.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount12.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount12.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_12.put(parent_id, 0L); + + MapOfIDAndCent12.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_12.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_12.remove(parent_id); + + MapOfIDAndCent12.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_12.put(cur_id, (long) cur_count); + } + } + } + } + } + + //remove keys with support less than 1 + Stream> stream2_1 = denseSetOfIDandCount2_1.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_1= new ArrayList<>(); + // sort and limit the list + stream2_1.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_1.add(x.getKey())); + + + Stream> stream2_2 = denseSetOfIDandCount2_2.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_2= new ArrayList<>(); + // sort and limit the list + stream2_2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_2.add(x.getKey())); + + + Stream> stream2_3 = denseSetOfIDandCount2_3.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_3= new ArrayList<>(); + // sort and limit the list + stream2_3.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_3.add(x.getKey())); + + + Stream> stream2_4 = denseSetOfIDandCount2_4.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_4= new ArrayList<>(); + // sort and limit the list + stream2_4.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_4.add(x.getKey())); + + Stream> stream2_5 = denseSetOfIDandCount2_5.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_5= new ArrayList<>(); + // sort and limit the list + stream2_5.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_5.add(x.getKey())); + + + Stream> stream2_6 = denseSetOfIDandCount2_6.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_6= new ArrayList<>(); + // sort and limit the list + stream2_6.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_6.add(x.getKey())); + + + Stream> stream2_7 = denseSetOfIDandCount2_7.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_7= new ArrayList<>(); + // sort and limit the list + stream2_7.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_7.add(x.getKey())); + + + Stream> stream2_8 = denseSetOfIDandCount2_8.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_8= new ArrayList<>(); + // sort and limit the list + stream2_8.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_8.add(x.getKey())); + + + Stream> stream2_9 = denseSetOfIDandCount2_9.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_9= new ArrayList<>(); + // sort and limit the list + stream2_9.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_9.add(x.getKey())); + + + Stream> stream2_10 = denseSetOfIDandCount2_10.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_10= new ArrayList<>(); + // sort and limit the list + stream2_10.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_10.add(x.getKey())); + + + Stream> stream2_11 = denseSetOfIDandCount2_11.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_11= new ArrayList<>(); + // sort and limit the list + stream2_11.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_11.add(x.getKey())); + + Stream> stream2_12 = denseSetOfIDandCount2_12.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_12= new ArrayList<>(); + // sort and limit the list + stream2_12.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_12.add(x.getKey())); + +// finding elbows + JythonTest elbowcalculator = new JythonTest(); + double sum_jt = 0; + + num_of_clusters_stage1[0]= elbowcalculator.find_elbow(sortedIDList2_1); + num_of_clusters_stage1[1]= elbowcalculator.find_elbow(sortedIDList2_2); + num_of_clusters_stage1[2]= elbowcalculator.find_elbow(sortedIDList2_3); + num_of_clusters_stage1[3]= elbowcalculator.find_elbow(sortedIDList2_4); + num_of_clusters_stage1[4]= elbowcalculator.find_elbow(sortedIDList2_5); + num_of_clusters_stage1[5]= elbowcalculator.find_elbow(sortedIDList2_6); + num_of_clusters_stage1[6]= elbowcalculator.find_elbow(sortedIDList2_7); + num_of_clusters_stage1[7]= elbowcalculator.find_elbow(sortedIDList2_8); + num_of_clusters_stage1[8]= elbowcalculator.find_elbow(sortedIDList2_9); + num_of_clusters_stage1[9]= elbowcalculator.find_elbow(sortedIDList2_10); + num_of_clusters_stage1[10]= elbowcalculator.find_elbow(sortedIDList2_11); + num_of_clusters_stage1[11]= 
elbowcalculator.find_elbow(sortedIDList2_12); + + for (int i=0 ; i<12; i++) { + + //int num_of_clusters_2= elbowcalculator.find_elbow(counts); +//// System.out.println("\n" + "No. of clusters_stage1 = " + num_of_clusters_stage1[i]); + //System.out.println( "No. of clusters_by_COUNT = " + num_of_clusters_2); +//// System.out.println( "************************************************************" ); + sum_jt = sum_jt + num_of_clusters_stage1[i]; + } +//// System.out.println("\n" + "sum of No. of clusters_stage1 = " + sum_jt); + + double avg_clus_stage1 = Math.ceil(sum_jt / 12.0) ; + System.out.println("\n" + "Average of No. of clusters_stage1 = " + avg_clus_stage1 ); + System.out.println( "************************************************************" ); + // finding the range of K for offline clustering : + min_k = (int) (avg_clus_stage1 - 3) ; + + max_k = (int) (avg_clus_stage1 + 3) ; + + if (min_k < 3 ) { min_k = 3 ; max_k = 9; } ; + + + float WCSS1 = 0; + float WCSS2 = 0; + float WCSS3 = 0; + float WCSS4 = 0; + float WCSS5 = 0; + float WCSS6 = 0; + float WCSS7 = 0; + float WCSS8 = 0; + float WCSS9 = 0; + float WCSS10 = 0; + float WCSS11 = 0; + float WCSS12 = 0; + + + HashMap denseSetOfIDandCount2 = new HashMap(); + HashMap MapOfIDAndCent = new HashMap<>(); + HashMap MapOfIDAndCount = new HashMap<>(); + HashMap MapOfIDAndWCSS = new HashMap<>(); + + + //* THIS IS THE RUNTIME CALCULATION OF WCSS STATISTICS WHICH IS DONE ONLINE: + + for (Long keys: sortedIDList2_1){ + WCSS1 = WCSS1 + MapOfIDAndWCSS1.get(keys);} + + for (Long keys: sortedIDList2_2) + { WCSS2 = WCSS2 + MapOfIDAndWCSS2.get(keys);} + + for (Long keys: sortedIDList2_3) + { WCSS3 = WCSS3 + MapOfIDAndWCSS3.get(keys);} + + for (Long keys: sortedIDList2_4) + { WCSS4 = WCSS4 + MapOfIDAndWCSS4.get(keys);} + + for (Long keys: sortedIDList2_5) + { WCSS5 = WCSS5 + MapOfIDAndWCSS5.get(keys);} + + for (Long keys: sortedIDList2_6){ + WCSS6 = WCSS6 + MapOfIDAndWCSS6.get(keys);} + + for (Long keys: sortedIDList2_7) + { 
WCSS7 = WCSS7 + MapOfIDAndWCSS7.get(keys);} + + for (Long keys: sortedIDList2_8) + { WCSS8 = WCSS8 + MapOfIDAndWCSS8.get(keys);} + + for (Long keys: sortedIDList2_9) + { WCSS9 = WCSS9 + MapOfIDAndWCSS9.get(keys);} + + for (Long keys: sortedIDList2_10) + { WCSS10 = WCSS10 + MapOfIDAndWCSS10.get(keys);} + + for (Long keys: sortedIDList2_11) + { WCSS11 = WCSS11 + MapOfIDAndWCSS11.get(keys);} + + for (Long keys: sortedIDList2_12) + { WCSS12 = WCSS12 + MapOfIDAndWCSS12.get(keys);} + + + + System.out.print(" wcss1 = " + WCSS1); + + System.out.print(" wcss2 = " + WCSS2); + + System.out.print(" wcss3 = " + WCSS3); + + System.out.print(" wcss4 = " + WCSS4); + + System.out.print(" wcss5 = " + WCSS5); + + System.out.print(" wcss6 = " + WCSS6); + + System.out.print(" wcss7 = " + WCSS7); + + System.out.print(" wcss8 = " + WCSS8); + + System.out.print(" wcss9 = " + WCSS9); + + System.out.print(" wcss10 = " + WCSS10); + + System.out.print(" wcss11 = " + WCSS11); + + System.out.print(" wcss12 = " + WCSS12); + +// float arr[] = {WCSS_off_1,WCSS_off_2,WCSS_off_3,WCSS_off_4,WCSS_off_5,WCSS_off_6,WCSS_off_7,WCSS_off_8,WCSS_off_9,WCSS_off_10}; + float arr[] = {WCSS1,WCSS2,WCSS3,WCSS4,WCSS5,WCSS6,WCSS7,WCSS8,WCSS9,WCSS10, WCSS11, WCSS12 }; + + int index_of_max = smallest(arr); + + if (index_of_max == 0){ + MapOfIDAndCount = MapOfIDAndCount1; + MapOfIDAndCent = MapOfIDAndCent1; + MapOfIDAndWCSS = MapOfIDAndWCSS1; + denseSetOfIDandCount2 = denseSetOfIDandCount2_1; + System.out.println("winner = tree1"); + } + if (index_of_max == 1){ + MapOfIDAndCount = MapOfIDAndCount2; + MapOfIDAndCent = MapOfIDAndCent2; + MapOfIDAndWCSS = MapOfIDAndWCSS2; + denseSetOfIDandCount2 = denseSetOfIDandCount2_2; + System.out.println("winner = tree2"); + } + if (index_of_max == 2){ + MapOfIDAndCount = MapOfIDAndCount3; + MapOfIDAndCent = MapOfIDAndCent3; + MapOfIDAndWCSS = MapOfIDAndWCSS3; + denseSetOfIDandCount2 = denseSetOfIDandCount2_3; + System.out.println("winner = tree3"); + } + if (index_of_max == 3) { + 
MapOfIDAndCount = MapOfIDAndCount4; + MapOfIDAndCent = MapOfIDAndCent4; + MapOfIDAndWCSS = MapOfIDAndWCSS4; + denseSetOfIDandCount2 = denseSetOfIDandCount2_4; + System.out.println("winner = tree4"); + } + + if (index_of_max == 4) { + MapOfIDAndCount = MapOfIDAndCount5; + MapOfIDAndCent = MapOfIDAndCent5; + MapOfIDAndWCSS = MapOfIDAndWCSS5; + denseSetOfIDandCount2 = denseSetOfIDandCount2_5; + System.out.println("winner = tree5"); + } + if (index_of_max == 5) { + MapOfIDAndCount = MapOfIDAndCount6; + MapOfIDAndCent = MapOfIDAndCent6; + MapOfIDAndWCSS = MapOfIDAndWCSS6; + denseSetOfIDandCount2 = denseSetOfIDandCount2_6; + System.out.println("winner = tree6"); + } + if (index_of_max == 6) { + MapOfIDAndCount = MapOfIDAndCount7; + MapOfIDAndCent = MapOfIDAndCent7; + MapOfIDAndWCSS = MapOfIDAndWCSS7; + denseSetOfIDandCount2 = denseSetOfIDandCount2_7; + System.out.println("winner = tree7"); + } + if (index_of_max == 7) { + MapOfIDAndCount = MapOfIDAndCount8; + MapOfIDAndCent = MapOfIDAndCent8; + MapOfIDAndWCSS = MapOfIDAndWCSS8; + denseSetOfIDandCount2 = denseSetOfIDandCount2_8; + System.out.println("winner = tree8"); + } + if (index_of_max == 8) { + MapOfIDAndCount = MapOfIDAndCount9; + MapOfIDAndCent = MapOfIDAndCent9; + MapOfIDAndWCSS = MapOfIDAndWCSS9; + denseSetOfIDandCount2 = denseSetOfIDandCount2_9; + System.out.println("winner = tree9"); + } + if (index_of_max == 9) { + MapOfIDAndCount = MapOfIDAndCount10; + MapOfIDAndCent = MapOfIDAndCent10; + MapOfIDAndWCSS = MapOfIDAndWCSS10; + denseSetOfIDandCount2 = denseSetOfIDandCount2_10; + System.out.println("winner = tree10"); + } + if (index_of_max == 10) { + MapOfIDAndCount = MapOfIDAndCount11; + MapOfIDAndCent = MapOfIDAndCent11; + MapOfIDAndWCSS = MapOfIDAndWCSS11; + denseSetOfIDandCount2 = denseSetOfIDandCount2_11; + System.out.println("winner = tree11"); + } + if (index_of_max == 11) { + MapOfIDAndCount = MapOfIDAndCount12; + MapOfIDAndCent = MapOfIDAndCent12; + MapOfIDAndWCSS = MapOfIDAndWCSS12; + 
denseSetOfIDandCount2 = denseSetOfIDandCount2_12; + System.out.println("winner = tree12"); + } + + + System.out.println("NumberOfMicroClustersAfterPruning&beforesortingLimit = "+ denseSetOfIDandCount2.size()); + + //remove keys with support less than 1 + Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2= new ArrayList<>(); + // sort and limit the list + stream2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2.add(x.getKey())); + + Multimap multimapWeightAndCent = ArrayListMultimap.create(); + + for (Long keys: sortedIDList2) + + { + + multimapWeightAndCent.put((Long)(MapOfIDAndCount.get(keys)), (float[]) (MapOfIDAndCent.get(keys))); + + } + + + return multimapWeightAndCent; + + +} + + + // this method gets the hashmap of ids and counts for the top n(cutoff) microclusters +// public static HashMap getmicroclusterIDandCount + + + + // this method gets the hashmap of ids and centroids for the top n(cutoff) microclusters + +// public static HashMap getmicroclusterIDandCents + + + + // this method gets the multihashmap of counts and centroids for the top n(cutoff) microclusters + + + public void run() { + + + List data_in_round = new ArrayList() ; + int count1=0; + int count2=0; + // List cents_aged = null ; /// may required to be initialized + // List cents_prev_round = null ; /// may required to be properly initialized + + boolean flag = true; // indicates first round if true else is false + Multimap WeightAndClusters = ArrayListMultimap.create() ; // null; + Multimap WeightAndClusters_prev = ArrayListMultimap.create() ; // null; + // Multimap WeightAndClusters_aged = ArrayListMultimap.create() ; // null; + + Listcentroids_prev = new ArrayList<>(); + List weights_prev =new ArrayList<>(); + + Listcentroids2 = new ArrayList<>(); + List weights2 =new ArrayList<>(); + + Listcentroids3 = new ArrayList<>(); + List weights3 =new ArrayList<>(); + + 
//System.out.println(count2); + + for (float[] x : so.getRawData()) { + + count1 = count1+1; + //System.out.println(count1); + //System.out.println(element); + //data_in_round.add(so.getRawData().get(count1)); + + data_in_round.add(x); + count1 = count1+1; + count2 = count2 +1; + + //System.out.println(count2); + + if (count2 >= 2500) { + + System.out.println(count2); + + + rngvec = new float[so.getDimparameter()]; + rngvec2 = new float[so.getDimparameter()]; + rngvec3 = new float[so.getDimparameter()]; + rngvec4 = new float[so.getDimparameter()]; + rngvec5 = new float[so.getDimparameter()]; + rngvec6 = new float[so.getDimparameter()]; + rngvec7 = new float[so.getDimparameter()]; + rngvec8 = new float[so.getDimparameter()]; + rngvec9 = new float[so.getDimparameter()]; + rngvec10 = new float[so.getDimparameter()]; + rngvec11 = new float[so.getDimparameter()]; + rngvec12 = new float[so.getDimparameter()]; + + counter = 0; + boolean randVect = so.getRandomVector(); + + // Random r = new Random(so.getRandomSeed()); + // Random r = new Random(3800635955020675334L) ; + Random r = new Random(); + Random r2 = new Random(); + Random r3 = new Random(); + Random r4 = new Random(); + Random r5 = new Random(); + Random r6 = new Random(); + Random r7 = new Random(); + Random r8 = new Random(); + Random r9 = new Random(); + Random r10 = new Random(); + Random r11 = new Random(); + Random r12 = new Random(); + + if (randVect==true){ + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) r.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec2[i] = (float) r2.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec3[i] = (float) r3.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec4[i] = (float) r4.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec5[i] = (float) r5.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec6[i] = (float) r6.nextGaussian(); + for (int i = 
0; i < so.getDimparameter(); i++) + rngvec7[i] = (float) r7.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec8[i] = (float) r8.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec9[i] = (float) r9.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec10[i] = (float) r10.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec11[i] = (float) r11.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec11[i] = (float) r12.nextGaussian(); + + } + + else { + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) 0; + + } + + WeightAndClusters = findDensityModes2(data_in_round); + + if ( flag == true) { + WeightAndClusters_prev = WeightAndClusters; + flag = false ; + } + + // WeightAndClusters_prev=WeightAndClusters_aged; + + // WeightAndClusters_aged.clear(); + + // System.out.println("multimap = "+ WeightAndClusters); + + // System.out.println("\tNumberOfMicroClusters_AfterPruning = "+ WeightAndClusters.size()); + // System.out.println("getRandomVector = "+ randVect); + // System.out.println("getRandomVector = "+ randVect); + + + for (Long weights : WeightAndClusters.keys()) + { + weights2.add((float)weights); + } + System.out.println("curr_keys = "+ weights2); + + for (Long weight : WeightAndClusters.keySet()) + + { + centroids2.addAll(WeightAndClusters.get(weight)); + } + + + weights_prev.clear(); + + for (Long weights : WeightAndClusters_prev.keys()) + + { + float temp = (float) (0.25 * weights); + weights_prev.add((float)temp); + } + + System.out.println("kweighted_prev= "+ weights_prev); + + + centroids_prev.clear(); + for (Long weights : WeightAndClusters_prev.keySet()) + + { + centroids_prev.addAll(WeightAndClusters_prev.get(weights)); + } + + + + + for ( float w : weights_prev) + + { + weights2.add(w); + } + + System.out.println("keys_joined = "+ weights2); + + for (float[] c : centroids_prev) + + { + + centroids2.add(c); + + } + + // 
System.out.println("merged weights size = "+ weights2.size()); + // System.out.println("merged cents size = "+ centroids2.size()); + + // trim the weights2 and centroids2 to fix size : + // logic: select the top n weights and its index from weights2 , then select the centroids from those index in centroids2 + + // Collections.sort(weights2, Collections.reverseOrder()); + // weights2.sort(Comparator.reverseOrder()); + + + int[] sortedIndices = IntStream.range(0, weights2.size()) + .boxed().sorted((i, j) -> weights2.get(j).compareTo( weights2.get(i)) ) + .mapToInt(ele -> ele).toArray(); + System.out.println("sorted_index= "+ Arrays. toString(sortedIndices)); + + // create weights3 and centroid3 and then select the top 60 or cutoff elements. + + int limit=so.getCutoff() + 10; + + for (int i=0; i<= limit ;i++) + + { + int indx = sortedIndices[i] ; // check + + Float key_in_indx = weights2.get(indx); // weights2 is list of floats + + weights3.add( key_in_indx); + + float[] cent_in_indx = centroids2.get(indx); + + centroids3.add(cent_in_indx); + + } + + System.out.println("keys_joined = "+ weights3); + + System.out.println("size of weights3 = "+ weights3.size()); + + System.out.println("size of centroids3 = "+ centroids3.size()); + + // create a multimap and add the weights 3 and centriods3 and set it as agedmultimap + + // also create the multimap aged from this weights2 and cents2. 
+ + Multimap WeightAndClusters_aged = ArrayListMultimap.create(); + + WeightAndClusters_aged.clear(); + + for (int i=0; i< weights3.size(); i++) + + { + WeightAndClusters_aged.put((weights3.get(i).longValue()), (float[]) (centroids3.get(i))); + } + + +// Agglomerative3 aggloOffline = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids2, so.getk()); +// aggloOffline.setWeights(weights2); +// this.centroids = aggloOffline.getCentroids(); + + KMeans2 aggloOffline2 = new KMeans2(); + aggloOffline2.setRawData(centroids3); + aggloOffline2.setWeights(weights3); + + + List elbow_wcss = new ArrayList<>(); + + for (int k=min_k; k<=max_k ;k++) { + + aggloOffline2.setK(k); + List cent1 = aggloOffline2.getCentroids(); + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(cent1, data_in_round)); + long tempu = (long) StatTests.WCSSECentroidsFloat(cent1, data_in_round); + elbow_wcss.add(tempu); + } + + + // finding elbows + JythonTest elbowcalculator = new JythonTest(); + double sum_jt = 0; + + int num_of_clusters_stage2 = min_k + elbowcalculator.find_elbow(elbow_wcss); + + + System.out.println("\n" + "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx " ); + System.out.println("\n" + " No. of clusters_stage_2_Final = " + num_of_clusters_stage2 ); + System.out.println("\n" + "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx " ); + + System.out.println("\n" + "No. 
of Data Points = " + so.getRawData().size() ); + + + // final choice of centroids : ( repetative calculation , please optimize ) + aggloOffline2.setK(num_of_clusters_stage2); + + this.centroids = aggloOffline2.getCentroids(); + + count2=0; + data_in_round.clear(); + weights2.clear(); + centroids2.clear(); + + weights3.clear(); + centroids3.clear(); + + WeightAndClusters.clear(); + + WeightAndClusters_prev.clear(); + + WeightAndClusters_prev = WeightAndClusters_aged; + + } // end of the if loop + + + } // end of the for loop + + + } // end of run method + + + public static void main(String[] args) throws FileNotFoundException, + IOException , InterruptedException { + + + float avgtime = 0; + // System.out.printf("%f\t", f); + + // List data = "/C:/Users/user/Desktop/temp/OutputTwrpCents1" + + // RPHashObject o = new SimpleArrayReader(gen.data, k); + + boolean raw = Boolean.parseBoolean(("raw")); + List data = null; + + // "C:\Users\sayan\OneDrive - University of Cincinnati\Documents\downloaded\run_results\run_results\3runs\har_k6\1D.txt" + + String inputfile = "C:/Users/sayan/OneDrive - University of Cincinnati/Documents/downloaded/run_results/run_results/3runs/har_k6/1D.txt" ; + System.out.println(inputfile); + + data = VectorUtil.readFile( inputfile , raw); + + int dummyk = 8; + RPHashObject o = new SimpleArrayReader(data, dummyk); + + o.setDimparameter(16); + o.setCutoff(60); + o.setRandomVector(true); + + TWRPv6_wcss_offline2_TEST2_10runs_agingmicroclusters rphit = new TWRPv6_wcss_offline2_TEST2_10runs_agingmicroclusters(o); + long startTime = System.nanoTime(); + // rphit.getCentroids(); + rphit.run(); + +// avgtime += (System.nanoTime() - startTime) / 100000000; + + + System.gc(); + + + } + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public 
void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + this.so.setK(getk); + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + return false; + } + + //@Override + public void setCutoff(int getCutoff) { + this.so.setCutoff(getCutoff); + } + + //@Override + public void setRandomVector(boolean getRandomVector) { + this.so.setRandomVector(getRandomVector); + } + + +} diff --git a/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_10runs_static_stream.java b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_10runs_static_stream.java new file mode 100644 index 0000000..4273119 --- /dev/null +++ b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_10runs_static_stream.java @@ -0,0 +1,1365 @@ +package edu.uc.rphash; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +//import java.util.Arrays; +import java.util.HashMap; +//import java.util.Iterator; +//import java.util.LinkedHashMap; +import java.util.List; +//import java.util.Map; +import java.util.Map.Entry; +import java.util.Random; +import java.util.TreeSet; +import java.util.stream.Stream; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.kneefinder.JythonTest; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.clusterers.Agglomerative3; +import edu.uc.rphash.tests.clusterers.KMeans2; +import edu.uc.rphash.tests.clusterers.Agglomerative3.ClusteringType; +import edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.VectorUtil; + +//import org.apache.commons.collections.map.MultiValueMap; 
+//import org.apache.commons.collections.map.*; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Multimap; + + + +// this algorithm runs twrp 10 times : (only the random bisection vector varies, the Projection matrix remains same) +// and selects the one which has the best wcss offline for the 10X candidate centroids. +public class TWRPv6_wcss_offline2_TEST2_10runs_static_stream implements Clusterer, Runnable { + + boolean znorm = false; + int [] num_of_clusters_stage1 = new int[12]; + int min_k; + int max_k; + + // convert this to an array of arrays + private int counter; + private float[] rngvec; + private float[] rngvec2; + private float[] rngvec3; + private float[] rngvec4; + private float[] rngvec5; + private float[] rngvec6; + private float[] rngvec7; + private float[] rngvec8; + private float[] rngvec9; + private float[] rngvec10; + private float[] rngvec11; + private float[] rngvec12; + + private List centroids = null; + + private RPHashObject so; + + public TWRPv6_wcss_offline2_TEST2_10runs_static_stream(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + return getCentroids(); + } + + @Override + public List getCentroids() { + if (centroids == null) + run(); + return centroids; + } + + +// This function returns the square of the euclidean distance. + public static float distancesq(float[] x, float[] y) { + if (x.length < 1) + return Float.MAX_VALUE; + if (y.length < 1) + return Float.MAX_VALUE; + float dist = (x[0] - y[0]) * (x[0] - y[0]); + for (int i = 1; i < x.length; i++) + dist += ((x[i] - y[i]) * (x[i] - y[i])); +// return (float) Math.sqrt(dist); + return dist; + } + + +// This method finds the smallest of the numbers and returns that index. 
+ + public static int smallest(float[] arr) + { + // Initialize minimum element + float min = arr[0]; + int minindex = 0; + // System.out.println(" LENGHT : " + arr.length); + // Traverse array elements from second and + // compare every element with current max + for (int i = 1; i < (arr.length); i++) { + // System.out.println("the min value of i : " + i); + if (arr[i]< min) { + min = arr[i]; + minindex = i; + } + } + // System.out.println("the min value is : " + min); + // System.out.println("the index for min val is : " + minindex); + return minindex; + } + + + /* + * X - set of vectors compute the medoid of a vector set + */ + float[] medoid(List X) { + float[] ret = X.get(0); + for (int i = 1; i < X.size(); i++) { + for (int j = 0; j < ret.length; j++) { + ret[j] += X.get(i)[j]; + } + } + for (int j = 0; j < ret.length; j++) { + ret[j] = ret[j] / ((float) X.size()); + } + return ret; + } + +// this updates the map two cents with different weights are merged into one. + public static float[][] UpdateHashMap(float cnt_1, float[] x_1, float wcss_1, + float cnt_2, float[] x_2 , float wcss_2) { + + float cnt_r = cnt_1 + cnt_2; + + float[] x_r = new float[x_1.length]; + + for (int i = 0; i < x_1.length; i++) { + x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; + + } + + float wcss = ( ((wcss_1 + distancesq(x_r,x_1)) ) + distancesq(x_r,x_2) ); + + float[][] ret = new float[3][]; + ret[0] = new float[1]; + ret[0][0] = cnt_r; + ret[1] = x_r; + ret[2]= new float [1]; + ret[2][0]= wcss; + return ret; + + } + + + + public long hashvec2( float[] xt, float[] x, + HashMap MapOfIDAndCent, HashMap MapOfIDAndCount, int ct, float[] rngvec, HashMap MapOfIDAndWCSS) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { +// s <<= 1; + s = s << 1 ; // left shift the bits of s by 1. 
+ if (xt[i] > rngvec[i]) + s= s+1; + + if (MapOfIDAndCent.containsKey(s)) { + + float CurrentCount = MapOfIDAndCount.get(s); + float CurrentCent [] = MapOfIDAndCent.get(s); + float CountForIncomingVector = 1; + float IncomingVector [] = x; + float currentWcss= MapOfIDAndWCSS.get(s); + float incomingWcss= 0; + + float[][] MergedValues = UpdateHashMap(CurrentCount , CurrentCent, currentWcss, CountForIncomingVector, IncomingVector, incomingWcss ); + + Long UpdatedCount = (long) MergedValues[0][0] ; + + float[] MergedVector = MergedValues[1] ; + + float wcss= MergedValues[2][0]; + + MapOfIDAndCount.put(s , UpdatedCount); + + MapOfIDAndCent.put(s, MergedVector); + + MapOfIDAndWCSS.put(s, wcss); + + } + + else { + + float[] xlist = x; + MapOfIDAndCent.put(s, xlist); + MapOfIDAndCount.put(s, (long)1); + MapOfIDAndWCSS.put(s, (float)0); + } + } + return s; + } + + + /* + * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid + * vector map + * + * hash the projected vector x and update the hash to centroid and counts + * maps + */ + void addtocounter(float[] x, Projector p, + HashMap IDAndCent,HashMap IDandID,int ct, float[] rngvec , HashMap IDandWCSS) { + float[] xt = p.project(x); + + hashvec2(xt,x,IDAndCent, IDandID, ct,rngvec , IDandWCSS); + } + + + /* + * X - data set k - canonical k in k-means l - clustering sub-space Compute + * density mode via iterative deepening hash counting + */ + + public Multimap findDensityModes2() { + + HashMap MapOfIDAndCent1 = new HashMap<>(); + HashMap MapOfIDAndCount1 = new HashMap<>(); + HashMap MapOfIDAndWCSS1 = new HashMap<>(); + + HashMap MapOfIDAndCent2 = new HashMap<>(); + HashMap MapOfIDAndCount2 = new HashMap<>(); + HashMap MapOfIDAndWCSS2 = new HashMap<>(); + + HashMap MapOfIDAndCent3 = new HashMap<>(); + HashMap MapOfIDAndCount3 = new HashMap<>(); + HashMap MapOfIDAndWCSS3 = new HashMap<>(); + + HashMap MapOfIDAndCent4 = new HashMap<>(); + HashMap MapOfIDAndCount4 = new HashMap<>(); + HashMap MapOfIDAndWCSS4 = 
new HashMap<>(); + + HashMap MapOfIDAndCent5 = new HashMap<>(); + HashMap MapOfIDAndCount5 = new HashMap<>(); + HashMap MapOfIDAndWCSS5 = new HashMap<>(); + + HashMap MapOfIDAndCent6 = new HashMap<>(); + HashMap MapOfIDAndCount6 = new HashMap<>(); + HashMap MapOfIDAndWCSS6 = new HashMap<>(); + + HashMap MapOfIDAndCent7 = new HashMap<>(); + HashMap MapOfIDAndCount7 = new HashMap<>(); + HashMap MapOfIDAndWCSS7 = new HashMap<>(); + + HashMap MapOfIDAndCent8 = new HashMap<>(); + HashMap MapOfIDAndCount8 = new HashMap<>(); + HashMap MapOfIDAndWCSS8 = new HashMap<>(); + + HashMap MapOfIDAndCent9 = new HashMap<>(); + HashMap MapOfIDAndCount9 = new HashMap<>(); + HashMap MapOfIDAndWCSS9 = new HashMap<>(); + + HashMap MapOfIDAndCent10 = new HashMap<>(); + HashMap MapOfIDAndCount10 = new HashMap<>(); + HashMap MapOfIDAndWCSS10 = new HashMap<>(); + + HashMap MapOfIDAndCent11 = new HashMap<>(); + HashMap MapOfIDAndCount11 = new HashMap<>(); + HashMap MapOfIDAndWCSS11 = new HashMap<>(); + + HashMap MapOfIDAndCent12 = new HashMap<>(); + HashMap MapOfIDAndCount12 = new HashMap<>(); + HashMap MapOfIDAndWCSS12 = new HashMap<>(); + + + // #create projector matrixs + Projector projector = so.getProjectionType(); + projector.setOrigDim(so.getdim()); + projector.setProjectedDim(so.getDimparameter()); + projector.setRandomSeed(so.getRandomSeed()); +// projector.setRandomSeed(535247432); + projector.init(); + + // #create projector matrixs + Projector projector2 = so.getProjectionType(); + projector2.setOrigDim(so.getdim()); + projector2.setProjectedDim(so.getDimparameter()); + projector2.setRandomSeed(so.getRandomSeed()); +// projector.setRandomSeed(535247432); + projector2.init(); + + // #create projector matrixs + Projector projector3 = so.getProjectionType(); + projector3.setOrigDim(so.getdim()); + projector3.setProjectedDim(so.getDimparameter()); + projector3.setRandomSeed(so.getRandomSeed()); +// projector.setRandomSeed(535247432); + projector3.init(); + + int cutoff = 
so.getCutoff(); + + int ct = 0; + + { + + for (float[] x : so.getRawData()) + { + addtocounter(x, projector, MapOfIDAndCent1, MapOfIDAndCount1,ct++, rngvec, MapOfIDAndWCSS1); + addtocounter(x, projector, MapOfIDAndCent2, MapOfIDAndCount2,ct++, rngvec2,MapOfIDAndWCSS2); + addtocounter(x, projector, MapOfIDAndCent3, MapOfIDAndCount3,ct++, rngvec3,MapOfIDAndWCSS3); + addtocounter(x, projector, MapOfIDAndCent4, MapOfIDAndCount4,ct++, rngvec4,MapOfIDAndWCSS4); + + addtocounter(x, projector2, MapOfIDAndCent5, MapOfIDAndCount5,ct++, rngvec5,MapOfIDAndWCSS5); + addtocounter(x, projector2, MapOfIDAndCent6, MapOfIDAndCount6,ct++, rngvec6, MapOfIDAndWCSS6); + addtocounter(x, projector2, MapOfIDAndCent7, MapOfIDAndCount7,ct++, rngvec7,MapOfIDAndWCSS7); + addtocounter(x, projector2, MapOfIDAndCent8, MapOfIDAndCount8,ct++, rngvec8,MapOfIDAndWCSS8); + + addtocounter(x, projector3, MapOfIDAndCent9, MapOfIDAndCount9,ct++, rngvec9,MapOfIDAndWCSS9); + addtocounter(x, projector3, MapOfIDAndCent10, MapOfIDAndCount10,ct++, rngvec10,MapOfIDAndWCSS10); + addtocounter(x, projector3, MapOfIDAndCent11, MapOfIDAndCount11,ct++, rngvec11,MapOfIDAndWCSS11); + addtocounter(x, projector3, MapOfIDAndCent12, MapOfIDAndCount12,ct++, rngvec12,MapOfIDAndWCSS12); + + + } + } + + System.out.println("NumberOfMicroClustersBeforePruning = "+ MapOfIDAndCent3.size()); + + // next we want to prune the tree by parent count comparison + // follows breadthfirst search + + HashMap denseSetOfIDandCount2_1 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount1.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount1.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount1.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_1.put(parent_id, 0L); + + MapOfIDAndCent1.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + + 
} + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_1.remove(parent_id); + + MapOfIDAndCent1.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_2 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount2.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount2.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount2.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_2.put(parent_id, 0L); + + MapOfIDAndCent2.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_2.remove(parent_id); + + MapOfIDAndCent2.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_3 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount3.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount3.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount3.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_3.put(parent_id, 0L); + + MapOfIDAndCent3.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_3.remove(parent_id); + + MapOfIDAndCent3.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_4 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount4.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) 
(MapOfIDAndCount4.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount4.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_4.put(parent_id, 0L); + + MapOfIDAndCent4.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_4.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_4.remove(parent_id); + + MapOfIDAndCent4.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_4.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_5 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount5.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount5.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount5.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_5.put(parent_id, 0L); + + MapOfIDAndCent5.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_5.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_5.remove(parent_id); + + MapOfIDAndCent5.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_5.put(cur_id, (long) cur_count); + } + } + } + } + } + + + + HashMap denseSetOfIDandCount2_6 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount6.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount6.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount6.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_6.put(parent_id, 0L); + + MapOfIDAndCent6.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_6.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > 
parent_count) { + denseSetOfIDandCount2_6.remove(parent_id); + + MapOfIDAndCent6.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_6.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_7 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount7.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount7.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount7.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_7.put(parent_id, 0L); + + MapOfIDAndCent7.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_7.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_7.remove(parent_id); + + MapOfIDAndCent7.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_7.put(cur_id, (long) cur_count); + } + } + } + } + } + + HashMap denseSetOfIDandCount2_8 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount8.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount8.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount8.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_8.put(parent_id, 0L); + + MapOfIDAndCent8.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_8.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_8.remove(parent_id); + + MapOfIDAndCent8.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_8.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_9 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount9.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount9.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + 
int parent_count = (int) (MapOfIDAndCount9.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_9.put(parent_id, 0L); + + MapOfIDAndCent9.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_9.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_9.remove(parent_id); + + MapOfIDAndCent9.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_9.put(cur_id, (long) cur_count); + } + } + } + } + } + + HashMap denseSetOfIDandCount2_10 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount10.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount10.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount10.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_10.put(parent_id, 0L); + + MapOfIDAndCent10.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_10.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_10.remove(parent_id); + + MapOfIDAndCent10.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_10.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_11 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount11.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount11.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount11.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_11.put(parent_id, 0L); + + MapOfIDAndCent11.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_11.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_11.remove(parent_id); + + 
MapOfIDAndCent11.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_11.put(cur_id, (long) cur_count); + } + } + } + } + } + + HashMap denseSetOfIDandCount2_12 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount12.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount12.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount12.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_12.put(parent_id, 0L); + + MapOfIDAndCent12.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_12.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_12.remove(parent_id); + + MapOfIDAndCent12.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_12.put(cur_id, (long) cur_count); + } + } + } + } + } + + //remove keys with support less than 1 + Stream> stream2_1 = denseSetOfIDandCount2_1.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_1= new ArrayList<>(); + // sort and limit the list + stream2_1.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_1.add(x.getKey())); + + + Stream> stream2_2 = denseSetOfIDandCount2_2.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_2= new ArrayList<>(); + // sort and limit the list + stream2_2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_2.add(x.getKey())); + + + Stream> stream2_3 = denseSetOfIDandCount2_3.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_3= new ArrayList<>(); + // sort and limit the list + stream2_3.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_3.add(x.getKey())); + + + Stream> stream2_4 = denseSetOfIDandCount2_4.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_4= new ArrayList<>(); + // sort and limit the list + stream2_4.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_4.add(x.getKey())); + + Stream> stream2_5 = denseSetOfIDandCount2_5.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_5= new ArrayList<>(); + // sort and limit the list + stream2_5.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_5.add(x.getKey())); + + + Stream> stream2_6 = denseSetOfIDandCount2_6.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_6= new ArrayList<>(); + // sort and limit the list + stream2_6.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_6.add(x.getKey())); + + + Stream> stream2_7 = denseSetOfIDandCount2_7.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_7= new ArrayList<>(); + // sort and limit the list + stream2_7.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_7.add(x.getKey())); + + + Stream> stream2_8 = denseSetOfIDandCount2_8.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_8= new ArrayList<>(); + // sort and limit the list + stream2_8.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_8.add(x.getKey())); + + + Stream> stream2_9 = denseSetOfIDandCount2_9.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_9= new ArrayList<>(); + // sort and limit the list + stream2_9.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_9.add(x.getKey())); + + + Stream> stream2_10 = denseSetOfIDandCount2_10.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_10= new ArrayList<>(); + // sort and limit the list + stream2_10.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_10.add(x.getKey())); + + + Stream> stream2_11 = denseSetOfIDandCount2_11.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_11= new ArrayList<>(); + // sort and limit the list + stream2_11.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_11.add(x.getKey())); + + Stream> stream2_12 = denseSetOfIDandCount2_12.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_12= new ArrayList<>(); + // sort and limit the list + stream2_12.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_12.add(x.getKey())); + +// finding elbows + JythonTest elbowcalculator = new JythonTest(); + double sum_jt = 0; + + num_of_clusters_stage1[0]= elbowcalculator.find_elbow(sortedIDList2_1); + num_of_clusters_stage1[1]= elbowcalculator.find_elbow(sortedIDList2_2); + num_of_clusters_stage1[2]= elbowcalculator.find_elbow(sortedIDList2_3); + num_of_clusters_stage1[3]= elbowcalculator.find_elbow(sortedIDList2_4); + num_of_clusters_stage1[4]= elbowcalculator.find_elbow(sortedIDList2_5); + num_of_clusters_stage1[5]= elbowcalculator.find_elbow(sortedIDList2_6); + num_of_clusters_stage1[6]= elbowcalculator.find_elbow(sortedIDList2_7); + num_of_clusters_stage1[7]= elbowcalculator.find_elbow(sortedIDList2_8); + num_of_clusters_stage1[8]= elbowcalculator.find_elbow(sortedIDList2_9); + num_of_clusters_stage1[9]= elbowcalculator.find_elbow(sortedIDList2_10); + num_of_clusters_stage1[10]= elbowcalculator.find_elbow(sortedIDList2_11); + num_of_clusters_stage1[11]= 
elbowcalculator.find_elbow(sortedIDList2_12); + + for (int i=0 ; i<12; i++) { + + //int num_of_clusters_2= elbowcalculator.find_elbow(counts); + System.out.println("\n" + "No. of clusters_stage1 = " + num_of_clusters_stage1[i]); + //System.out.println( "No. of clusters_by_COUNT = " + num_of_clusters_2); + System.out.println( "************************************************************" ); + sum_jt = sum_jt + num_of_clusters_stage1[i]; + } + System.out.println("\n" + "sum of No. of clusters_stage1 = " + sum_jt); + + double avg_clus_stage1 = Math.ceil(sum_jt / 12.0) ; + System.out.println("\n" + "Average of No. of clusters_stage1 = " + avg_clus_stage1 ); + System.out.println( "************************************************************" ); + // finding the range of K for offline clustering : + min_k = (int) (avg_clus_stage1 - 3) ; + + max_k = (int) (avg_clus_stage1 + 3) ; + + if (min_k < 3 ) { min_k = 3 ; max_k = 9; } ; + + + float WCSS1 = 0; + float WCSS2 = 0; + float WCSS3 = 0; + float WCSS4 = 0; + float WCSS5 = 0; + float WCSS6 = 0; + float WCSS7 = 0; + float WCSS8 = 0; + float WCSS9 = 0; + float WCSS10 = 0; + float WCSS11 = 0; + float WCSS12 = 0; + + + HashMap denseSetOfIDandCount2 = new HashMap(); + HashMap MapOfIDAndCent = new HashMap<>(); + HashMap MapOfIDAndCount = new HashMap<>(); + HashMap MapOfIDAndWCSS = new HashMap<>(); + + + //* THIS IS THE RUNTIME CALCULATION OF WCSS STATISTICS WHICH IS DONE ONLINE: + + for (Long keys: sortedIDList2_1){ + WCSS1 = WCSS1 + MapOfIDAndWCSS1.get(keys);} + + for (Long keys: sortedIDList2_2) + { WCSS2 = WCSS2 + MapOfIDAndWCSS2.get(keys);} + + for (Long keys: sortedIDList2_3) + { WCSS3 = WCSS3 + MapOfIDAndWCSS3.get(keys);} + + for (Long keys: sortedIDList2_4) + { WCSS4 = WCSS4 + MapOfIDAndWCSS4.get(keys);} + + for (Long keys: sortedIDList2_5) + { WCSS5 = WCSS5 + MapOfIDAndWCSS5.get(keys);} + + for (Long keys: sortedIDList2_6){ + WCSS6 = WCSS6 + MapOfIDAndWCSS6.get(keys);} + + for (Long keys: sortedIDList2_7) + { WCSS7 = 
WCSS7 + MapOfIDAndWCSS7.get(keys);} + + for (Long keys: sortedIDList2_8) + { WCSS8 = WCSS8 + MapOfIDAndWCSS8.get(keys);} + + for (Long keys: sortedIDList2_9) + { WCSS9 = WCSS9 + MapOfIDAndWCSS9.get(keys);} + + for (Long keys: sortedIDList2_10) + { WCSS10 = WCSS10 + MapOfIDAndWCSS10.get(keys);} + + for (Long keys: sortedIDList2_11) + { WCSS11 = WCSS11 + MapOfIDAndWCSS11.get(keys);} + + for (Long keys: sortedIDList2_12) + { WCSS12 = WCSS12 + MapOfIDAndWCSS12.get(keys);} + + + + System.out.print(" wcss1 = " + WCSS1); + + System.out.print(" wcss2 = " + WCSS2); + + System.out.print(" wcss3 = " + WCSS3); + + System.out.print(" wcss4 = " + WCSS4); + + System.out.print(" wcss5 = " + WCSS5); + + System.out.print(" wcss6 = " + WCSS6); + + System.out.print(" wcss7 = " + WCSS7); + + System.out.print(" wcss8 = " + WCSS8); + + System.out.print(" wcss9 = " + WCSS9); + + System.out.print(" wcss10 = " + WCSS10); + + System.out.print(" wcss11 = " + WCSS11); + + System.out.print(" wcss12 = " + WCSS12); + +// float arr[] = {WCSS_off_1,WCSS_off_2,WCSS_off_3,WCSS_off_4,WCSS_off_5,WCSS_off_6,WCSS_off_7,WCSS_off_8,WCSS_off_9,WCSS_off_10}; + float arr[] = {WCSS1,WCSS2,WCSS3,WCSS4,WCSS5,WCSS6,WCSS7,WCSS8,WCSS9,WCSS10, WCSS11, WCSS12 }; + + int index_of_max = smallest(arr); + + if (index_of_max == 0){ + MapOfIDAndCount = MapOfIDAndCount1; + MapOfIDAndCent = MapOfIDAndCent1; + MapOfIDAndWCSS = MapOfIDAndWCSS1; + denseSetOfIDandCount2 = denseSetOfIDandCount2_1; + System.out.println("winner = tree1"); + } + if (index_of_max == 1){ + MapOfIDAndCount = MapOfIDAndCount2; + MapOfIDAndCent = MapOfIDAndCent2; + MapOfIDAndWCSS = MapOfIDAndWCSS2; + denseSetOfIDandCount2 = denseSetOfIDandCount2_2; + System.out.println("winner = tree2"); + } + if (index_of_max == 2){ + MapOfIDAndCount = MapOfIDAndCount3; + MapOfIDAndCent = MapOfIDAndCent3; + MapOfIDAndWCSS = MapOfIDAndWCSS3; + denseSetOfIDandCount2 = denseSetOfIDandCount2_3; + System.out.println("winner = tree3"); + } + if (index_of_max == 3) { + 
MapOfIDAndCount = MapOfIDAndCount4; + MapOfIDAndCent = MapOfIDAndCent4; + MapOfIDAndWCSS = MapOfIDAndWCSS4; + denseSetOfIDandCount2 = denseSetOfIDandCount2_4; + System.out.println("winner = tree4"); + } + + if (index_of_max == 4) { + MapOfIDAndCount = MapOfIDAndCount5; + MapOfIDAndCent = MapOfIDAndCent5; + MapOfIDAndWCSS = MapOfIDAndWCSS5; + denseSetOfIDandCount2 = denseSetOfIDandCount2_5; + System.out.println("winner = tree5"); + } + if (index_of_max == 5) { + MapOfIDAndCount = MapOfIDAndCount6; + MapOfIDAndCent = MapOfIDAndCent6; + MapOfIDAndWCSS = MapOfIDAndWCSS6; + denseSetOfIDandCount2 = denseSetOfIDandCount2_6; + System.out.println("winner = tree6"); + } + if (index_of_max == 6) { + MapOfIDAndCount = MapOfIDAndCount7; + MapOfIDAndCent = MapOfIDAndCent7; + MapOfIDAndWCSS = MapOfIDAndWCSS7; + denseSetOfIDandCount2 = denseSetOfIDandCount2_7; + System.out.println("winner = tree7"); + } + if (index_of_max == 7) { + MapOfIDAndCount = MapOfIDAndCount8; + MapOfIDAndCent = MapOfIDAndCent8; + MapOfIDAndWCSS = MapOfIDAndWCSS8; + denseSetOfIDandCount2 = denseSetOfIDandCount2_8; + System.out.println("winner = tree8"); + } + if (index_of_max == 8) { + MapOfIDAndCount = MapOfIDAndCount9; + MapOfIDAndCent = MapOfIDAndCent9; + MapOfIDAndWCSS = MapOfIDAndWCSS9; + denseSetOfIDandCount2 = denseSetOfIDandCount2_9; + System.out.println("winner = tree9"); + } + if (index_of_max == 9) { + MapOfIDAndCount = MapOfIDAndCount10; + MapOfIDAndCent = MapOfIDAndCent10; + MapOfIDAndWCSS = MapOfIDAndWCSS10; + denseSetOfIDandCount2 = denseSetOfIDandCount2_10; + System.out.println("winner = tree10"); + } + if (index_of_max == 10) { + MapOfIDAndCount = MapOfIDAndCount11; + MapOfIDAndCent = MapOfIDAndCent11; + MapOfIDAndWCSS = MapOfIDAndWCSS11; + denseSetOfIDandCount2 = denseSetOfIDandCount2_11; + System.out.println("winner = tree11"); + } + if (index_of_max == 11) { + MapOfIDAndCount = MapOfIDAndCount12; + MapOfIDAndCent = MapOfIDAndCent12; + MapOfIDAndWCSS = MapOfIDAndWCSS12; + 
denseSetOfIDandCount2 = denseSetOfIDandCount2_12; + System.out.println("winner = tree12"); + } + + + System.out.println("NumberOfMicroClustersAfterPruning&beforesortingLimit = "+ denseSetOfIDandCount2.size()); + + //remove keys with support less than 1 + Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2= new ArrayList<>(); + // sort and limit the list + stream2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2.add(x.getKey())); + + Multimap multimapWeightAndCent = ArrayListMultimap.create(); + + for (Long keys: sortedIDList2) + + { + + multimapWeightAndCent.put((Long)(MapOfIDAndCount.get(keys)), (float[]) (MapOfIDAndCent.get(keys))); + + } + + + return multimapWeightAndCent; + +} + + public void run() { + rngvec = new float[so.getDimparameter()]; + rngvec2 = new float[so.getDimparameter()]; + rngvec3 = new float[so.getDimparameter()]; + rngvec4 = new float[so.getDimparameter()]; + rngvec5 = new float[so.getDimparameter()]; + + rngvec6 = new float[so.getDimparameter()]; + rngvec7 = new float[so.getDimparameter()]; + rngvec8 = new float[so.getDimparameter()]; + rngvec9 = new float[so.getDimparameter()]; + rngvec10 = new float[so.getDimparameter()]; + rngvec11 = new float[so.getDimparameter()]; + rngvec12 = new float[so.getDimparameter()]; + + counter = 0; + boolean randVect = so.getRandomVector(); + + // Random r = new Random(so.getRandomSeed()); + // Random r = new Random(3800635955020675334L) ; + Random r = new Random(); + Random r2 = new Random(); + Random r3 = new Random(); + Random r4 = new Random(); + Random r5 = new Random(); + Random r6 = new Random(); + Random r7 = new Random(); + Random r8 = new Random(); + Random r9 = new Random(); + Random r10 = new Random(); + Random r11 = new Random(); + Random r12 = new Random(); + + if (randVect==true){ + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) r.nextGaussian(); + for (int i = 0; i 
< so.getDimparameter(); i++) + rngvec2[i] = (float) r2.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec3[i] = (float) r3.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec4[i] = (float) r4.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec5[i] = (float) r5.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec6[i] = (float) r6.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec7[i] = (float) r7.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec8[i] = (float) r8.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec9[i] = (float) r9.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec10[i] = (float) r10.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec11[i] = (float) r11.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec11[i] = (float) r12.nextGaussian(); + + } + + else { + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) 0; + + } + + Multimap WeightAndClusters = findDensityModes2(); + + Listcentroids2 = new ArrayList<>(); + List weights2 =new ArrayList<>(); + + + System.out.println("\tNumberOfMicroClusters_AfterPruning = "+ WeightAndClusters.size()); + System.out.println("getRandomVector = "+ randVect); + + + for (Long weights : WeightAndClusters.keys()) + { + + weights2.add((float)weights); + + } + + + for (Long weight : WeightAndClusters.keySet()) + + { + + centroids2.addAll(WeightAndClusters.get(weight)); + + } + + +// Agglomerative3 aggloOffline = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids2, so.getk()); +// aggloOffline.setWeights(weights2); +// this.centroids = aggloOffline.getCentroids(); + + KMeans2 aggloOffline2 = new KMeans2(); + aggloOffline2.setRawData(centroids2); + aggloOffline2.setWeights(weights2); + + + List data1 = null; + data1 = so.getRawData(); + + List elbow_wcss = new ArrayList<>(); + + for 
(int k=min_k; k<=max_k ;k++) { + + aggloOffline2.setK(k); + List cent1 = aggloOffline2.getCentroids(); + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(cent1, data1)); + long tempu = (long) StatTests.WCSSECentroidsFloat(cent1, data1); + elbow_wcss.add(tempu); + } + + + // finding elbows + JythonTest elbowcalculator = new JythonTest(); + double sum_jt = 0; + + int num_of_clusters_stage2 = min_k + elbowcalculator.find_elbow(elbow_wcss); + + System.out.println("\n" + "No. of clusters_stage_2_Final = " + num_of_clusters_stage2 ); + + System.out.println("\n" + "No. of Data Points = " + so.getRawData().size() ); + + + // final choice of centroids : ( repetative calculation , please optimize ) + aggloOffline2.setK(num_of_clusters_stage2); + + this.centroids = aggloOffline2.getCentroids(); + + } + + + public static void main(String[] args) throws FileNotFoundException, + IOException , InterruptedException { + + // int k = 10;//6; + // int d = 200;//16; + // int n = 10000; + // float var = 1.5f; + // int count = 1; + // System.out.printf("ClusterVar\t"); + // for (int i = 0; i < count; i++) + // System.out.printf("Trial%d\t", i); + // System.out.printf("RealWCSS\n"); + + // String Output = "/C:/Users/deysn/Desktop/temp/har/run_results/10runs/OutputTwrpCents_mainfunc_1" ; + + // float f = var; + float avgrealwcss = 0; + float avgtime = 0; + // System.out.printf("%f\t", f); + // GenerateData gen = new GenerateData(k, n/k, d, f, true, .5f); + + // gen.writeCSVToFile(new File("/home/lee/Desktop/reclsh/in.csv")); + // List data = "/C:/Users/user/Desktop/temp/OutputTwrpCents1" + + // RPHashObject o = new SimpleArrayReader(gen.data, k); + + boolean raw = Boolean.parseBoolean(("raw")); + List data = null; + List data_in_round = new ArrayList() ; + // "C:\Users\sayan\OneDrive - University of Cincinnati\Documents\downloaded\run_results\run_results\3runs\har_k6\1D.txt" + + String inputfile = "C:/Users/sayan/OneDrive - University of 
Cincinnati/Documents/downloaded/run_results/run_results/3runs/har_k6/1D.txt" ; + System.out.println(inputfile); + + data = VectorUtil.readFile( inputfile , raw); + int count1=0; + int count2=0; + boolean flag = true; + + for (float[] element : data) + + { + count1 = count1+1; + //System.out.println(count1); + //System.out.println(element); + data_in_round.add(data.get(count1-1)); + count2 = count2 +1; + + //System.out.println(count2); + + //if (count2 >= 1000) { + if (count2 == 10299) { + System.out.println(count2); + + int dummyk = 8; + RPHashObject o = new SimpleArrayReader(data_in_round, dummyk); + + + o.setDimparameter(16); + o.setCutoff(70); + o.setRandomVector(true); + +// System.out.println("cutoff = "+ o.getCutoff()); +// System.out.println("get_random_Vector = "+ o.getRandomVector()); + + TWRPv6_wcss_offline2_TEST2_10runs_static_stream rphit = new TWRPv6_wcss_offline2_TEST2_10runs_static_stream(o); + long startTime = System.nanoTime(); + List centsr = rphit.getCentroids(); + + avgtime += (System.nanoTime() - startTime) / 100000000; + +// avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(),gen.getData()); + + String Output = "/C:/Users/sayan/OneDrive - University of Cincinnati/Documents/downloaded/results/har_6clus/har_k6_kmeans_130_cutoff"+"_" +"_"+".csv" ; + VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); + +// System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, data_in_round)); + System.out.println("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); + System.gc(); + +// System.out.printf("%.0f\n", avgrealwcss / count); + + + data_in_round.clear(); + count2=0; + centsr.clear(); + + } + + //System.out.println("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); + + + } + + int dummyk = 8; + RPHashObject o = new SimpleArrayReader(data_in_round, dummyk); + + + o.setDimparameter(16); + o.setCutoff(60); + o.setRandomVector(true); + +// 
System.out.println("cutoff = "+ o.getCutoff()); +// System.out.println("get_random_Vector = "+ o.getRandomVector()); + + TWRPv6_wcss_offline2_TEST2_10runs_static_stream rphit = new TWRPv6_wcss_offline2_TEST2_10runs_static_stream(o); + long startTime = System.nanoTime(); + List centsr = rphit.getCentroids(); + + avgtime += (System.nanoTime() - startTime) / 100000000; + +// avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(),gen.getData()); + + String Output = "/C:/Users/sayan/OneDrive - University of Cincinnati/Documents/downloaded/results/har_6clus/har_k6_kmeans_130_cutoff"+"_" +"_"+".csv" ; + VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); + +// System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, data_in_round)); + System.out.println("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); + System.gc(); + +// System.out.printf("%.0f\n", avgrealwcss / count); + + + } + + @Override + public RPHashObject getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + this.so.setK(getk); + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + return false; + } + + //@Override + public void setCutoff(int getCutoff) { + this.so.setCutoff(getCutoff); + } + + //@Override + public void setRandomVector(boolean getRandomVector) { + this.so.setRandomVector(getRandomVector); + } + + +} diff --git 
a/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_5runs.java b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_5runs.java new file mode 100644 index 0000000..f67fd7e --- /dev/null +++ b/src/main/java/edu/uc/rphash/TWRPv6_wcss_offline2_TEST2_5runs.java @@ -0,0 +1,1062 @@ +package edu.uc.rphash; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +//import java.util.Arrays; +import java.util.HashMap; +//import java.util.Iterator; +//import java.util.LinkedHashMap; +import java.util.List; +//import java.util.Map; +import java.util.Map.Entry; +import java.util.Random; +import java.util.TreeSet; +import java.util.stream.Stream; + +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.Readers.SimpleArrayReader; +import edu.uc.rphash.projections.Projector; +import edu.uc.rphash.tests.StatTests; +import edu.uc.rphash.tests.clusterers.Agglomerative3; +import edu.uc.rphash.tests.clusterers.KMeans2; +import edu.uc.rphash.tests.clusterers.Agglomerative3.ClusteringType; +import edu.uc.rphash.tests.generators.GenerateData; +import edu.uc.rphash.util.VectorUtil; + +//import org.apache.commons.collections.map.MultiValueMap; +//import org.apache.commons.collections.map.*; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Multimap; + + + +// this algorithm runs twrp 5 times : (only the random bisection vector varies, the Projection matrix remains same) +// and selects the one which has the best wcss offline for the 10X candidate centroids. 
+public class TWRPv6_wcss_offline2_TEST2_5runs implements Clusterer, Runnable { + + boolean znorm = false; + + private int counter; + private float[] rngvec; + private float[] rngvec2; + private float[] rngvec3; + private float[] rngvec4; + private float[] rngvec5; + + private List centroids = null; + + private RPHashObject so; + + public TWRPv6_wcss_offline2_TEST2_5runs(RPHashObject so) { + this.so = so; + } + + public List getCentroids(RPHashObject so) { + this.so = so; + return getCentroids(); + } + + @Override + public List getCentroids() { + if (centroids == null) + run(); + return centroids; + } + + +// This function returns the square of the euclidean distance. + public static float distancesq(float[] x, float[] y) { + if (x.length < 1) + return Float.MAX_VALUE; + if (y.length < 1) + return Float.MAX_VALUE; + float dist = (x[0] - y[0]) * (x[0] - y[0]); + for (int i = 1; i < x.length; i++) + dist += ((x[i] - y[i]) * (x[i] - y[i])); +// return (float) Math.sqrt(dist); + return dist; + } + + +// This method finds the smallest of the numbers and returns that index. 
+ + public static int smallest(float[] arr) + { + // Initialize minimum element + float min = arr[0]; + int minindex = 0; + // System.out.println(" LENGHT : " + arr.length); + // Traverse array elements from second and + // compare every element with current max + for (int i = 1; i < (arr.length); i++) { + // System.out.println("the min value of i : " + i); + if (arr[i]< min) { + min = arr[i]; + minindex = i; + } + } + // System.out.println("the min value is : " + min); + // System.out.println("the index for min val is : " + minindex); + return minindex; + } + + + /* + * X - set of vectors compute the medoid of a vector set + */ + float[] medoid(List X) { + float[] ret = X.get(0); + for (int i = 1; i < X.size(); i++) { + for (int j = 0; j < ret.length; j++) { + ret[j] += X.get(i)[j]; + } + } + for (int j = 0; j < ret.length; j++) { + ret[j] = ret[j] / ((float) X.size()); + } + return ret; + } + +// this updates the map two cents with different weights are merged into one. + public static float[][] UpdateHashMap(float cnt_1, float[] x_1, float wcss_1, + float cnt_2, float[] x_2 , float wcss_2) { + + float cnt_r = cnt_1 + cnt_2; + + float[] x_r = new float[x_1.length]; + + for (int i = 0; i < x_1.length; i++) { + x_r[i] = (cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r; + + } + + float wcss = ( ((wcss_1 + distancesq(x_r,x_1)) ) + distancesq(x_r,x_2) ); + + float[][] ret = new float[3][]; + ret[0] = new float[1]; + ret[0][0] = cnt_r; + ret[1] = x_r; + ret[2]= new float [1]; + ret[2][0]= wcss; + return ret; + + } + +// this method is used to calculate the offline wcss +// UpdateHashMap_offlineWcss( CurrentCent, currentWcss, IncomingVector, incomingWcss ); + + public static float[][] UpdateHashMap_offlineWcss(float[] x_1, float wcss_1,float[] x_2 ) { + + float wcss = wcss_1 + distancesq(x_1,x_2); + + + float[][] ret = new float[3][]; + ret[0] = new float[1]; + ret[2]= new float [1]; + ret[2][0]= wcss; + return ret; + + } + + public long hashvec2( float[] xt, float[] x, + 
HashMap MapOfIDAndCent, HashMap MapOfIDAndCount, int ct, float[] rngvec, HashMap MapOfIDAndWCSS) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { +// s <<= 1; + s = s << 1 ; // left shift the bits of s by 1. + if (xt[i] > rngvec[i]) + s= s+1; + + if (MapOfIDAndCent.containsKey(s)) { + + float CurrentCount = MapOfIDAndCount.get(s); + float CurrentCent [] = MapOfIDAndCent.get(s); + float CountForIncomingVector = 1; + float IncomingVector [] = x; + float currentWcss= MapOfIDAndWCSS.get(s); + float incomingWcss= 0; + + float[][] MergedValues = UpdateHashMap(CurrentCount , CurrentCent, currentWcss, CountForIncomingVector, IncomingVector, incomingWcss ); + + Long UpdatedCount = (long) MergedValues[0][0] ; + + float[] MergedVector = MergedValues[1] ; + + float wcss= MergedValues[2][0]; + + MapOfIDAndCount.put(s , UpdatedCount); + + MapOfIDAndCent.put(s, MergedVector); + + MapOfIDAndWCSS.put(s, wcss); + + } + + else { + + float[] xlist = x; + MapOfIDAndCent.put(s, xlist); + MapOfIDAndCount.put(s, (long)1); + MapOfIDAndWCSS.put(s, (float)0); + } + } + return s; + } + +// this hash is to calculate the wcss +// hashvec2_forwcss(xt,x,IDAndCent,rngvec ,IDandWCSS); + + public long hashvec2_forwcss( float[] xt, float[] x, HashMap MapOfIDAndCent, float[] rngvec, HashMapIDandWCSS_offline) { + long s = 1; //fixes leading 0's bug + for (int i = 0; i < xt.length; i++) { + s = s << 1 ; // left shift the bits of s by 1. 
+ if (xt[i] > rngvec[i]) + s= s+1; + + if (MapOfIDAndCent.containsKey(s)) { + + float CurrentCent [] = MapOfIDAndCent.get(s); + float IncomingVector [] = x; + + + float currentWcss= 0; + + if (IDandWCSS_offline.containsKey(s)) { + currentWcss= IDandWCSS_offline.get(s); + } + + float[][] MergedValues = UpdateHashMap_offlineWcss( CurrentCent, currentWcss, IncomingVector ); + + float wcss= MergedValues[2][0]; + + IDandWCSS_offline.put(s, wcss); + + } + } + return s; + } + + /* + * x - input vector IDAndCount - ID->count map IDAndCent - ID->centroid + * vector map + * + * hash the projected vector x and update the hash to centroid and counts + * maps + */ + void addtocounter(float[] x, Projector p, + HashMap IDAndCent,HashMap IDandID,int ct, float[] rngvec , HashMap IDandWCSS) { + float[] xt = p.project(x); + + hashvec2(xt,x,IDAndCent, IDandID, ct,rngvec , IDandWCSS); + } + + // this method is used to compute the offline WCSS to choose the best of the clusters + //calcWCSSoffline(x, projector, MapOfIDAndCent1, rngvec, MapOfIDAandWCSS1_offline); + + void calcWCSSoffline(float[] x, Projector p, HashMap MapOfIDAndCent, float[] rngvec , HashMap MapOfIDAandWCSS_offline) { + + float[] xt = p.project(x); + + hashvec2_forwcss(xt,x,MapOfIDAndCent,rngvec ,MapOfIDAandWCSS_offline); + + } + + static boolean isPowerOfTwo(long num) { + return (num & -num) == num; + } + + /* + * X - data set k - canonical k in k-means l - clustering sub-space Compute + * density mode via iterative deepening hash counting + */ + + public Multimap findDensityModes2() { + + HashMap MapOfIDAndCent1 = new HashMap<>(); + HashMap MapOfIDAndCount1 = new HashMap<>(); + HashMap MapOfIDAndWCSS1 = new HashMap<>(); + + HashMap MapOfIDAndCent2 = new HashMap<>(); + HashMap MapOfIDAndCount2 = new HashMap<>(); + HashMap MapOfIDAndWCSS2 = new HashMap<>(); + + HashMap MapOfIDAndCent3 = new HashMap<>(); + HashMap MapOfIDAndCount3 = new HashMap<>(); + HashMap MapOfIDAndWCSS3 = new HashMap<>(); + + HashMap MapOfIDAndCent4 
= new HashMap<>(); + HashMap MapOfIDAndCount4 = new HashMap<>(); + HashMap MapOfIDAndWCSS4 = new HashMap<>(); + + HashMap MapOfIDAndCent5 = new HashMap<>(); + HashMap MapOfIDAndCount5 = new HashMap<>(); + HashMap MapOfIDAndWCSS5 = new HashMap<>(); + + + + + // #create projector matrixs + Projector projector = so.getProjectionType(); + projector.setOrigDim(so.getdim()); + projector.setProjectedDim(so.getDimparameter()); + projector.setRandomSeed(so.getRandomSeed()); +// projector.setRandomSeed(535247432); + + projector.init(); + int cutoff = so.getCutoff(); + + int ct = 0; + + { + + for (float[] x : so.getRawData()) + { + addtocounter(x, projector, MapOfIDAndCent1, MapOfIDAndCount1,ct++, rngvec, MapOfIDAndWCSS1); + addtocounter(x, projector, MapOfIDAndCent2, MapOfIDAndCount2,ct++, rngvec2,MapOfIDAndWCSS2); + addtocounter(x, projector, MapOfIDAndCent3, MapOfIDAndCount3,ct++, rngvec3,MapOfIDAndWCSS3); + addtocounter(x, projector, MapOfIDAndCent4, MapOfIDAndCount4,ct++, rngvec4,MapOfIDAndWCSS4); + addtocounter(x, projector, MapOfIDAndCent5, MapOfIDAndCount5,ct++, rngvec5,MapOfIDAndWCSS5); + + } + } + + System.out.println("NumberOfMicroClustersBeforePruning = "+ MapOfIDAndCent3.size()); + + // next we want to prune the tree by parent count comparison + // follows breadthfirst search + + HashMap denseSetOfIDandCount2_1 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount1.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount1.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount1.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_1.put(parent_id, 0L); + + MapOfIDAndCent1.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_1.remove(parent_id); + + MapOfIDAndCent1.put(parent_id, new float[]{}); 
+ + denseSetOfIDandCount2_1.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_2 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount2.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount2.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount2.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_2.put(parent_id, 0L); + + MapOfIDAndCent2.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_2.remove(parent_id); + + MapOfIDAndCent2.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_2.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_3 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount3.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount3.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount3.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_3.put(parent_id, 0L); + + MapOfIDAndCent3.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_3.remove(parent_id); + + MapOfIDAndCent3.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_3.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_4 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount4.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount4.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount4.get(parent_id).longValue()); + + if(cur_count!=0 && 
parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_4.put(parent_id, 0L); + + MapOfIDAndCent4.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_4.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_4.remove(parent_id); + + MapOfIDAndCent4.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_4.put(cur_id, (long) cur_count); + } + } + } + } + } + + + HashMap denseSetOfIDandCount2_5 = new HashMap(); + for (Long cur_id : new TreeSet(MapOfIDAndCount5.keySet())) + { + if (cur_id >so.getk()){ + int cur_count = (int) (MapOfIDAndCount5.get(cur_id).longValue()); + long parent_id = cur_id>>>1; + int parent_count = (int) (MapOfIDAndCount5.get(parent_id).longValue()); + + if(cur_count!=0 && parent_count!=0) + { + if(cur_count == parent_count) { + denseSetOfIDandCount2_5.put(parent_id, 0L); + + MapOfIDAndCent5.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_5.put(cur_id, (long) cur_count); + + } + else + { + if(2 * cur_count > parent_count) { + denseSetOfIDandCount2_5.remove(parent_id); + + MapOfIDAndCent5.put(parent_id, new float[]{}); + + denseSetOfIDandCount2_5.put(cur_id, (long) cur_count); + } + } + } + } + } + + + + //remove keys with support less than 1 + Stream> stream2_1 = denseSetOfIDandCount2_1.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_1= new ArrayList<>(); + // sort and limit the list + stream2_1.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_1.add(x.getKey())); + + + + Stream> stream2_2 = denseSetOfIDandCount2_2.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_2= new ArrayList<>(); + // sort and limit the list + stream2_2.sorted(Entry. 
comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_2.add(x.getKey())); + + + Stream> stream2_3 = denseSetOfIDandCount2_3.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_3= new ArrayList<>(); + // sort and limit the list + stream2_3.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_3.add(x.getKey())); + + + Stream> stream2_4 = denseSetOfIDandCount2_4.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_4= new ArrayList<>(); + // sort and limit the list + stream2_4.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_4.add(x.getKey())); + + Stream> stream2_5 = denseSetOfIDandCount2_5.entrySet().stream().filter(p -> p.getValue() > 1); + List sortedIDList2_5= new ArrayList<>(); + // sort and limit the list + stream2_5.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2_5.add(x.getKey())); + + + float WCSS1 = 0; + float WCSS2 = 0; + float WCSS3 = 0; + float WCSS4 = 0; + float WCSS5 = 0; + + float WCSS_off_1 = 0; + float WCSS_off_2 = 0; + float WCSS_off_3 = 0; + float WCSS_off_4 = 0; + float WCSS_off_5 = 0; + + HashMap denseSetOfIDandCount2 = new HashMap(); + HashMap MapOfIDAndCent = new HashMap<>(); + HashMap MapOfIDAndCount = new HashMap<>(); + HashMap MapOfIDAndWCSS = new HashMap<>(); + + HashMap MapOfIDAandWCSS_offline_1 = new HashMap<>(); + HashMap MapOfIDAandWCSS_offline_2 = new HashMap<>(); + HashMap MapOfIDAandWCSS_offline_3 = new HashMap<>(); + HashMap MapOfIDAandWCSS_offline_4 = new HashMap<>(); + HashMap MapOfIDAandWCSS_offline_5 = new HashMap<>(); + + // calculate the real wcss in offline fashion, so for the keys , hash the points into those buckets + // and calculate the wcss as we know their centroids : + + + for (float[] x : so.getRawData()) + { + + calcWCSSoffline(x, projector, MapOfIDAndCent1, rngvec , MapOfIDAandWCSS_offline_1); + calcWCSSoffline(x, 
projector, MapOfIDAndCent2, rngvec2, MapOfIDAandWCSS_offline_2); + calcWCSSoffline(x, projector, MapOfIDAndCent3, rngvec3, MapOfIDAandWCSS_offline_3); + calcWCSSoffline(x, projector, MapOfIDAndCent4, rngvec4, MapOfIDAandWCSS_offline_4); + calcWCSSoffline(x, projector, MapOfIDAndCent5, rngvec5, MapOfIDAandWCSS_offline_5); + } + + + //* THIS IS THE RUNTIME CALCULATION OF WCSS STATISTICS WHICH IS DONE ONLINE: + + for (Long keys: sortedIDList2_1){ + WCSS1 = WCSS1 + MapOfIDAndWCSS1.get(keys);} + + for (Long keys: sortedIDList2_2) + { WCSS2 = WCSS2 + MapOfIDAndWCSS2.get(keys);} + + for (Long keys: sortedIDList2_3) + { WCSS3 = WCSS3 + MapOfIDAndWCSS3.get(keys);} + + for (Long keys: sortedIDList2_4) + { WCSS4 = WCSS4 + MapOfIDAndWCSS4.get(keys);} + + for (Long keys: sortedIDList2_5) + { WCSS5 = WCSS5 + MapOfIDAndWCSS5.get(keys);} + +//* THIS IS THE RUNTIME CALCULATION OF WCSS STATISTICS WHICH REQUIRES ANOTHER PASS OVER THE DATA: + + for (Long keys: sortedIDList2_1) + { WCSS_off_1 = WCSS_off_1 + MapOfIDAandWCSS_offline_1.get(keys);} + + for (Long keys: sortedIDList2_2) + { WCSS_off_2 = WCSS_off_2 + MapOfIDAandWCSS_offline_2.get(keys);} + + for (Long keys: sortedIDList2_3) + { WCSS_off_3 = WCSS_off_3 + MapOfIDAandWCSS_offline_3.get(keys);} + + for (Long keys: sortedIDList2_4) + { WCSS_off_4 = WCSS_off_4 + MapOfIDAandWCSS_offline_4.get(keys);} + + for (Long keys: sortedIDList2_5) + { WCSS_off_5 = WCSS_off_5 + MapOfIDAandWCSS_offline_5.get(keys);} + + System.out.print("wcss1 = " + WCSS1); + System.out.println(" wcss_ofline_1 = " + WCSS_off_1); + + System.out.print("wcss2 = " + WCSS2); + System.out.println(" wcss_ofline_2 = " + WCSS_off_2); + + System.out.print("wcss3 = " + WCSS3); + System.out.println(" wcss_ofline_3 = " + WCSS_off_3); + + System.out.print("wcss4 = " + WCSS4); + System.out.println(" wcss_ofline_4 = " + WCSS_off_4); + + System.out.print("wcss5 = " + WCSS5); + System.out.println(" wcss_ofline_5 = " + WCSS_off_5); + + + float arr[] = {WCSS_off_1, WCSS_off_2, 
WCSS_off_3, WCSS_off_4, WCSS_off_5}; + int index_of_max = smallest(arr); + + if (index_of_max == 0){ + MapOfIDAndCount = MapOfIDAndCount1; + MapOfIDAndCent = MapOfIDAndCent1; + MapOfIDAndWCSS = MapOfIDAndWCSS1; + denseSetOfIDandCount2 = denseSetOfIDandCount2_1; + System.out.println("winner = tree1"); + } + if (index_of_max == 1){ + MapOfIDAndCount = MapOfIDAndCount2; + MapOfIDAndCent = MapOfIDAndCent2; + MapOfIDAndWCSS = MapOfIDAndWCSS2; + denseSetOfIDandCount2 = denseSetOfIDandCount2_2; + System.out.println("winner = tree2"); + } + if (index_of_max == 2){ + MapOfIDAndCount = MapOfIDAndCount3; + MapOfIDAndCent = MapOfIDAndCent3; + MapOfIDAndWCSS = MapOfIDAndWCSS3; + denseSetOfIDandCount2 = denseSetOfIDandCount2_3; + System.out.println("winner = tree3"); + } + if (index_of_max == 3) { + MapOfIDAndCount = MapOfIDAndCount4; + MapOfIDAndCent = MapOfIDAndCent4; + MapOfIDAndWCSS = MapOfIDAndWCSS4; + denseSetOfIDandCount2 = denseSetOfIDandCount2_4; + System.out.println("winner = tree4"); + } + if (index_of_max == 4) { + MapOfIDAndCount = MapOfIDAndCount5; + MapOfIDAndCent = MapOfIDAndCent5; + MapOfIDAndWCSS = MapOfIDAndWCSS5; + denseSetOfIDandCount2 = denseSetOfIDandCount2_5; + System.out.println("winner = tree5"); + } + + + System.out.println("NumberOfMicroClustersAfterPruning&beforesortingLimit = "+ denseSetOfIDandCount2.size()); + + //remove keys with support less than 1 + Stream> stream2 = denseSetOfIDandCount2.entrySet().stream().filter(p -> p.getValue() > 1); + + List sortedIDList2= new ArrayList<>(); + // sort and limit the list + stream2.sorted(Entry. comparingByValue().reversed()).limit(cutoff) + .forEachOrdered(x -> sortedIDList2.add(x.getKey())); + + Multimap multimapWeightAndCent = ArrayListMultimap.create(); + + for (Long keys: sortedIDList2) + + { + + multimapWeightAndCent.put((Long)(MapOfIDAndCount.get(keys)), (float[]) (MapOfIDAndCent.get(keys))); + + } + + + // this is to be taken out . only done for hypothesis testing. 
+ + boolean raw = Boolean.parseBoolean(("raw")); + List data = null; + try { + data = VectorUtil.readFile("/C:/Users/deysn/Desktop/temp/har/1D.txt", raw); + } catch (FileNotFoundException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + Multimap multimapWeightAndCent1 = ArrayListMultimap.create(); + for (Long keys: sortedIDList2_1) + { + multimapWeightAndCent1.put((Long)(MapOfIDAndCount1.get(keys)), (float[]) (MapOfIDAndCent1.get(keys))); + } + + Multimap multimapWeightAndCent2 = ArrayListMultimap.create(); + for (Long keys: sortedIDList2_2) + { + multimapWeightAndCent2.put((Long)(MapOfIDAndCount2.get(keys)), (float[]) (MapOfIDAndCent2.get(keys))); + } + + Multimap multimapWeightAndCent3 = ArrayListMultimap.create(); + for (Long keys: sortedIDList2_3) + { + multimapWeightAndCent3.put((Long)(MapOfIDAndCount3.get(keys)), (float[]) (MapOfIDAndCent3.get(keys))); + } + + Multimap multimapWeightAndCent4 = ArrayListMultimap.create(); + for (Long keys: sortedIDList2_4) + { + multimapWeightAndCent4.put((Long)(MapOfIDAndCount4.get(keys)), (float[]) (MapOfIDAndCent4.get(keys))); + } + + Multimap multimapWeightAndCent5 = ArrayListMultimap.create(); + for (Long keys: sortedIDList2_5) + { + multimapWeightAndCent5.put((Long)(MapOfIDAndCount5.get(keys)), (float[]) (MapOfIDAndCent5.get(keys))); + } + + + Listcentroids1 = new ArrayList<>(); + List weights1 =new ArrayList<>(); + for (Long weights : multimapWeightAndCent1.keys()) + { + weights1.add((float)weights); + } + + for (Long weight : multimapWeightAndCent1.keySet()) + + { + centroids1.addAll(multimapWeightAndCent1.get(weight)); + } + +// Agglomerative3 aggloOffline = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids1, so.getk()); +// aggloOffline.setWeights(weights1); +// List finalcentroids_1 = aggloOffline.getCentroids(); + + KMeans2 Offline = new KMeans2(); + Offline.setK(so.getk()); + 
Offline.setRawData(centroids1); + Offline.setWeights(weights1); + List finalcentroids_1 = Offline.getCentroids(); + + + + Listcentroids2 = new ArrayList<>(); + List weights2 =new ArrayList<>(); + for (Long weights : multimapWeightAndCent2.keys()) + { + weights2.add((float)weights); + } + + for (Long weight : multimapWeightAndCent2.keySet()) + + { + centroids2.addAll(multimapWeightAndCent1.get(weight)); + } + +// Agglomerative3 aggloOffline2 = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids2, so.getk()); +// aggloOffline2.setWeights(weights2); +// List finalcentroids_2 = aggloOffline2.getCentroids(); + + KMeans2 Offline2 = new KMeans2(); + Offline2.setK(so.getk()); + Offline2.setRawData(centroids2); + Offline2.setWeights(weights2); + List finalcentroids_2 = Offline2.getCentroids(); + + + Listcentroids3 = new ArrayList<>(); + List weights3 =new ArrayList<>(); + for (Long weights : multimapWeightAndCent3.keys()) + { + weights3.add((float)weights); + } + + for (Long weight : multimapWeightAndCent3.keySet()) + + { + centroids3.addAll(multimapWeightAndCent3.get(weight)); + } + +// Agglomerative3 aggloOffline3 = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids3, so.getk()); +// aggloOffline3.setWeights(weights3); +// List finalcentroids_3 = aggloOffline3.getCentroids(); + + KMeans2 Offline3 = new KMeans2(); + Offline3.setK(so.getk()); + Offline3.setRawData(centroids3); + Offline3.setWeights(weights3); + List finalcentroids_3 = Offline3.getCentroids(); + + Listcentroids4 = new ArrayList<>(); + List weights4 =new ArrayList<>(); + for (Long weights : multimapWeightAndCent4.keys()) + { + weights4.add((float)weights); + } + + for (Long weight : multimapWeightAndCent4.keySet()) + + { + centroids4.addAll(multimapWeightAndCent4.get(weight)); + } + +// Agglomerative3 aggloOffline4 = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids4, so.getk()); +// aggloOffline4.setWeights(weights4); +// List finalcentroids_4 = aggloOffline4.getCentroids(); + + KMeans2 
Offline4 = new KMeans2(); + Offline4.setK(so.getk()); + Offline4.setRawData(centroids4); + Offline4.setWeights(weights4); + List finalcentroids_4 = Offline4.getCentroids(); + + + Listcentroids5 = new ArrayList<>(); + List weights5 =new ArrayList<>(); + for (Long weights : multimapWeightAndCent5.keys()) + { + weights5.add((float)weights); + } + + for (Long weight : multimapWeightAndCent5.keySet()) + + { + centroids5.addAll(multimapWeightAndCent5.get(weight)); + } + +// Agglomerative3 aggloOffline5 = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids5, so.getk()); +// aggloOffline5.setWeights(weights5); +// List finalcentroids_5 = aggloOffline5.getCentroids(); + + KMeans2 Offline5 = new KMeans2(); + Offline5.setK(so.getk()); + Offline5.setRawData(centroids5); + Offline5.setWeights(weights5); + List finalcentroids_5 = Offline5.getCentroids(); + + + + VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/Desktop/temp/har/run_results/5runs/OutputTwrpCents_tree1"),finalcentroids_1, false); + + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_1, data)); + + VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/Desktop/temp/har/run_results/5runs/OutputTwrpCents_tree2"),finalcentroids_2, false); + + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_2, data)); + + VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/Desktop/temp/har/run_results/5runs/OutputTwrpCents_tree3"),finalcentroids_3, false); + + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_3, data)); + + VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/Desktop/temp/har/run_results/5runs/OutputTwrpCents_tree4"),finalcentroids_4, false); + + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(finalcentroids_4, data)); + + VectorUtil.writeCentroidsToFile(new File("/C:/Users/deysn/Desktop/temp/har/run_results/5runs/OutputTwrpCents_tree5"),finalcentroids_5, false); + + System.out.printf("%.0f\t", 
StatTests.WCSSECentroidsFloat(finalcentroids_5, data)); + + return multimapWeightAndCent; + +} + + public void run() { + rngvec = new float[so.getDimparameter()]; + rngvec2 = new float[so.getDimparameter()]; + rngvec3 = new float[so.getDimparameter()]; + rngvec4 = new float[so.getDimparameter()]; + rngvec5 = new float[so.getDimparameter()]; + + counter = 0; + boolean randVect = so.getRandomVector(); + + // Random r = new Random(so.getRandomSeed()); + // Random r = new Random(3800635955020675334L) ; + Random r = new Random(); + Random r2 = new Random(); + Random r3 = new Random(); + Random r4 = new Random(); + Random r5 = new Random(); + + if (randVect==true){ + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) r.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec2[i] = (float) r2.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec3[i] = (float) r3.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec4[i] = (float) r4.nextGaussian(); + for (int i = 0; i < so.getDimparameter(); i++) + rngvec5[i] = (float) r5.nextGaussian(); + } + + else { + for (int i = 0; i < so.getDimparameter(); i++) + rngvec[i] = (float) 0; + + } + + Multimap WeightAndClusters = findDensityModes2(); + + Listcentroids2 = new ArrayList<>(); + List weights2 =new ArrayList<>(); + + + System.out.println("\tNumberOfMicroClusters_AfterPruning = "+ WeightAndClusters.size()); + System.out.println("getRandomVector = "+ randVect); + + + for (Long weights : WeightAndClusters.keys()) + { + + weights2.add((float)weights); + + } + + + for (Long weight : WeightAndClusters.keySet()) + + { + + centroids2.addAll(WeightAndClusters.get(weight)); + + } + + +// Agglomerative3 aggloOffline = new Agglomerative3(ClusteringType.AVG_LINKAGE,centroids2, so.getk()); +// aggloOffline.setWeights(weights2); +// this.centroids = aggloOffline.getCentroids(); + + KMeans2 aggloOffline2 = new KMeans2(); + aggloOffline2.setK(so.getk()); + 
aggloOffline2.setRawData(centroids2); + aggloOffline2.setWeights(weights2); + this.centroids = aggloOffline2.getCentroids(); + + } + + + public static void main(String[] args) throws FileNotFoundException, + IOException { + + int k = 10;//6; + int d = 200;//16; + int n = 10000; + float var = 1.5f; + int count = 1; + // System.out.printf("ClusterVar\t"); + // for (int i = 0; i < count; i++) + // System.out.printf("Trial%d\t", i); + // System.out.printf("RealWCSS\n"); + + String Output = "/C:/Users/deysn/Desktop/temp/har/run_results/5runs/OutputTwrpCents_mainfunc_1" ; + + float f = var; + float avgrealwcss = 0; + float avgtime = 0; + // System.out.printf("%f\t", f); + // GenerateData gen = new GenerateData(k, n/k, d, f, true, .5f); + + // gen.writeCSVToFile(new File("/home/lee/Desktop/reclsh/in.csv")); + // List data = "/C:/Users/user/Desktop/temp/OutputTwrpCents1" + + // RPHashObject o = new SimpleArrayReader(gen.data, k); + + boolean raw = Boolean.parseBoolean(("raw")); + List data = null; + data = VectorUtil.readFile("/C:/Users/deysn/Desktop/temp/har/1D.txt", raw); + k = 12; + RPHashObject o = new SimpleArrayReader(data, k); + + + o.setDimparameter(16); + o.setCutoff(60); + o.setRandomVector(true); + +// System.out.println("cutoff = "+ o.getCutoff()); +// System.out.println("get_random_Vector = "+ o.getRandomVector()); + + TWRPv6_wcss_offline2_TEST2_5runs rphit = new TWRPv6_wcss_offline2_TEST2_5runs(o); + long startTime = System.nanoTime(); + List centsr = rphit.getCentroids(); + + avgtime += (System.nanoTime() - startTime) / 100000000; + +// avgrealwcss += StatTests.WCSSEFloatCentroid(gen.getMedoids(),gen.getData()); + + + VectorUtil.writeCentroidsToFile(new File(Output),centsr, false); + +// System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, gen.data)); + System.out.printf("%.0f\t", StatTests.WCSSECentroidsFloat(centsr, data)); + + System.gc(); + +// System.out.printf("%.0f\n", avgrealwcss / count); + + + } + + @Override + public RPHashObject 
getParam() { + return so; + } + + @Override + public void setWeights(List counts) { + // TODO Auto-generated method stub + + } + + @Override + public void setData(List centroids) { + this.centroids = centroids; + + } + + @Override + public void setRawData(List centroids) { + if (this.centroids == null) + this.centroids = new ArrayList<>(centroids.size()); + for (float[] f : centroids) { + this.centroids.add(new Centroid(f, 0)); + } + } + + @Override + public void setK(int getk) { + this.so.setK(getk); + } + + @Override + public void reset(int randomseed) { + centroids = null; + so.setRandomSeed(randomseed); + } + + @Override + public boolean setMultiRun(int runs) { + return false; + } + + //@Override + public void setCutoff(int getCutoff) { + this.so.setCutoff(getCutoff); + } + + //@Override + public void setRandomVector(boolean getRandomVector) { + this.so.setRandomVector(getRandomVector); + } + + +} diff --git a/src/main/java/edu/uc/rphash/aging/Decay.java b/src/main/java/edu/uc/rphash/aging/Decay.java new file mode 100644 index 0000000..6edd861 --- /dev/null +++ b/src/main/java/edu/uc/rphash/aging/Decay.java @@ -0,0 +1,54 @@ +package edu.uc.rphash.aging; + +public class Decay implements Runnable { + + + // public double value; + public double t; + public double decayRate; + + + @Override + public void run() { + + } + +public static double ExpDecayFormula ( Number halfLifeInSeconds , float t ) { + + Double decayRate = - Math.log(2) / halfLifeInSeconds.longValue() / 1000; + + Double expMultiplier = Math.pow(Math.E, decayRate * t); + return expMultiplier; + + } + +public static double LinearDecayFormula ( Number lifeTimeInSeconds , float t ) { + + + Double lifeTime = Double.valueOf(lifeTimeInSeconds.longValue()) * 1000; + + if (t < 0 || t > lifeTime ) { + Double linearMultiplier = -0.1; // explain + return linearMultiplier; + } + else { + Double linearMultiplier =(1 - t / lifeTime); + return linearMultiplier; + } + +} + +public static double LogDecayFormula (long 
lifeTimeInSeconds , float t) { + + + Double lifeTime = Double.valueOf(lifeTimeInSeconds) * 1000; + + if (t < 0 || t >= lifeTime ) { + return 0.0; + } else { + // return value + 1 - Math.pow(Math.E, Math.log(value + 1)/lifeTime*t); + return lifeTime; + } + } + +} \ No newline at end of file diff --git a/src/main/java/edu/uc/rphash/aging/DecayPositional.java b/src/main/java/edu/uc/rphash/aging/DecayPositional.java new file mode 100644 index 0000000..197ba08 --- /dev/null +++ b/src/main/java/edu/uc/rphash/aging/DecayPositional.java @@ -0,0 +1,65 @@ +package edu.uc.rphash.aging; + +public class DecayPositional implements Runnable { + + // public double value; + public double t; + public double decayRate; + + @Override + public void run() { + + } + +public static double ExpDecayFormula ( Number halfLifeInSeconds , float t ) { + + Double decayRate = - Math.log(2) / halfLifeInSeconds.longValue() / 1000; + Double expMultiplier = Math.pow(Math.E, decayRate * t); + return expMultiplier; + + } + +public static double ExpDecayFormula2 ( double decayRate , float t ) { + decayRate = -1*decayRate; + Double expMultiplier = Math.pow(Math.E, decayRate * t); + return expMultiplier; + +} +public static double LinearDecayFormula ( Number lifeTimeInSeconds , float t ) { + + Double lifeTime = Double.valueOf(lifeTimeInSeconds.longValue()) * 1000; + + if (t < 0 || t > lifeTime ) { + Double linearMultiplier = -0.1; // explain + return linearMultiplier; + } + else { + Double linearMultiplier =(1 - t / lifeTime); + return linearMultiplier; + } +} + +public static double LogDecayFormula (long lifeTimeInSeconds , float t) { + + Double lifeTime = Double.valueOf(lifeTimeInSeconds) * 1000; + + if (t < 0 || t >= lifeTime ) { + return 0.0; + } else { + // return value + 1 - Math.pow(Math.E, Math.log(value + 1)/lifeTime*t); + return lifeTime; + } + } + +public static void main(String[] args) +{ + //Number halfLifeInSeconds = 0.1; + double decayRate = 0.5; + float t = 2 ; + +// double expmul = 
ExpDecayFormula ( halfLifeInSeconds , t ); + double expmul2 = ExpDecayFormula2( decayRate , t ); + System.out.print(expmul2); +} + +} diff --git a/src/main/java/edu/uc/rphash/aging/ageCentriods.java b/src/main/java/edu/uc/rphash/aging/ageCentriods.java new file mode 100644 index 0000000..fe5a601 --- /dev/null +++ b/src/main/java/edu/uc/rphash/aging/ageCentriods.java @@ -0,0 +1,91 @@ +package edu.uc.rphash.aging; + +import java.util.List; + +import edu.uc.rphash.Centroid; +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.aging.DecayPositional; + + +public class ageCentriods implements Runnable { + + static double decayRate = 0.5 ; + static DecayPositional decay = new DecayPositional(); + + @Override + public void run() { + // TODO Auto-generated method stub + + } + + + public static float[][] weighted_merge(double cnt_1, float[] x_1, + double cnt_2, float[] x_2) { + + + cnt_1 = (float) cnt_1; + cnt_2 = (float) cnt_2; + + + float cnt_r = (float) (cnt_1 + cnt_2); + float[] x_r = new float[x_1.length]; + + for (int i = 0; i < x_1.length; i++) { + x_r[i] = (float) ((cnt_1 * x_1[i] + cnt_2 * x_2[i]) / cnt_r); + + } + + float[][] ret = new float[3][]; + ret[0] = new float[1]; + + ret[0][0] = cnt_r; + ret[1] = x_r; + return ret; + } + + public static List> ageListOfcent( List> prev ) { + + + for (int i = 0; i < prev.size(); i++) + { + + double ageMultiplier= decay.ExpDecayFormula2 ( decayRate , i ); + List tempCents = prev.get(i); + + for (int j =0 ; j ageListOfcents2( List prev , List curr) { + + + for (int i = 0; i < prev.size(); i++) + { + + double ageMultiplier= decay.ExpDecayFormula2 ( decayRate , i ); + List tempCents = (List) prev.get(i); + + for (int j =0 ; j > ageListOfMicroClusters( List< HashMap > Maps_OfIDAndCount ) { + + // HashMap MapOfIDAndCount1 = new HashMap<>(); + for (int i = 0; i < Maps_OfIDAndCount.size(); i++) + { + + double ageMultiplier= decay.ExpDecayFormula2 ( decayRate , i ); + HashMap MapOfIDAndCount = Maps_OfIDAndCount.get(i); + 
+ for (Long cur_id : new TreeSet(MapOfIDAndCount.keySet())) { + + int cur_count = (int) (MapOfIDAndCount.get(cur_id).longValue()); + + cur_count = (int) (cur_count * ageMultiplier); + } + } + + return Maps_OfIDAndCount; + } +} diff --git a/src/main/java/edu/uc/rphash/centroidTracker/trackCentroids.java b/src/main/java/edu/uc/rphash/centroidTracker/trackCentroids.java new file mode 100644 index 0000000..89c3fda --- /dev/null +++ b/src/main/java/edu/uc/rphash/centroidTracker/trackCentroids.java @@ -0,0 +1,173 @@ +package edu.uc.rphash.centroidTracker; + +import java.util.List; + +import edu.uc.rphash.Centroid; +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.frequentItemSet.KHHCentroidCounter; +import edu.uc.rphash.lsh.LSH; + +/* +1. Check the number of previous centroids and current centroids. +2. Case i. If the previous centroids = current centroids +Compute a distance matrix ( Euclidean, Cosine ) between the two sets of centroids. +Assign each one to its closest one. + Case ii. If the previous centroids > current centroids + Compute the distance matrix between two sets. + Case a. find closest one and assign movements. Find 2nd closest ones to them and assign them merged. + + Case iii. If Previous centroids < current centroids +Compute the distance matrix between two sets. + Case a. find closest one and assign movements and declare the remaining as new. +*/ + +public class trackCentroids implements Runnable { + +// private float[] vec; +// private float[][] dismtx; + + public trackCentroids(float[] vec, LSH[] lshfuncs) { + + } + + // This function returns the square of the euclidean distance. 
+ public static float distancesq(float[] x, float[] y) { + if (x.length < 1) + return Float.MAX_VALUE; + if (y.length < 1) + return Float.MAX_VALUE; + float dist = (x[0] - y[0]) * (x[0] - y[0]); + for (int i = 1; i < x.length; i++) + dist += ((x[i] - y[i]) * (x[i] - y[i])); +// return (float) Math.sqrt(dist); + return dist; + } + + // This function returns the cosine dot distance. + public static float dot(float[] t, float[] u) { + float s = 0; + for (int i = 0; i < t.length; i++) { + s += t[i] * u[i]; + } + return s; + } + + /* + computes the distance matrix between the set of centroids. + + + */ + + + + public static float[][] createDistanceMatrix( List prev , List curr) { + + float[][] dismtx = new float[prev.size()][curr.size()+3] ; + int currcent=-1; + int prevcent =-1; + + for (int i = 0; i < prev.size(); i++) + { + + float mindis= distancesq(prev.get(i).centroid() , curr.get(0).centroid()); + for (int j = 0; j < curr.size(); j++) { + + dismtx[i][j]= distancesq(prev.get(i).centroid() , curr.get(j).centroid()); + if (dismtx[i][j]<= mindis) { + mindis = dismtx[i][j]; + prevcent=i; + currcent=j; + } + } + dismtx[i][curr.size()+3] = mindis; + dismtx[i][curr.size()+2] = currcent; + dismtx[i][curr.size()+1] = prevcent; + + + } + + return dismtx; + } + + + + public static float[][] createCosineDistanceMatrix( List prev , List curr) { + + float[][] dismtx = new float[prev.size()][curr.size()+3] ; + int currcent=-1; + int prevcent =-1; + + for (int i = 0; i < prev.size(); i++) + { + + float mindis= dot(prev.get(i).centroid() , curr.get(0).centroid()); + for (int j = 0; j < curr.size(); j++) { + + dismtx[i][j]= dot(prev.get(i).centroid() , curr.get(j).centroid()); + if (dismtx[i][j]<= mindis) { + mindis = dismtx[i][j]; + prevcent=i; + currcent=j; + } + } + dismtx[i][curr.size()+3] = mindis; + dismtx[i][curr.size()+2] = currcent; + dismtx[i][curr.size()+1] = prevcent; + + + } + + return dismtx; + } + + + + @Override + public void run() { + // TODO Auto-generated method 
stub + + } + + + + public static float[][] mappingcents( List prev , List curr) { + + float[][] mapping1 = new float[prev.size()][curr.size()]; + float[][] mapping2 = new float[prev.size()][curr.size()]; + + float[][] dismtx_euclid=createDistanceMatrix(prev, curr); + float[][] dismtx_cosine=createCosineDistanceMatrix(prev, curr); + + if (prev.size()==curr.size()) + { + + mapping1=dismtx_euclid; + mapping2=dismtx_cosine; + + }; + + if (prev.size()curr.size()) // centroids may have merged and formed + { + + mapping1=dismtx_euclid; + mapping2=dismtx_cosine; + + }; + return mapping1; + + } + + +} + + + + diff --git a/src/main/java/edu/uc/rphash/decoders/Golay.java b/src/main/java/edu/uc/rphash/decoders/Golay.java deleted file mode 100644 index 6392790..0000000 --- a/src/main/java/edu/uc/rphash/decoders/Golay.java +++ /dev/null @@ -1,350 +0,0 @@ -package edu.uc.rphash.decoders; - -import java.util.Arrays; -import java.util.Random; - -import edu.uc.rphash.frequentItemSet.Countable; -import edu.uc.rphash.standardhash.MurmurHash; -import edu.uc.rphash.util.VectorUtil; - -public class Golay implements Decoder{ - /** - * Utility methods that converts a binary string into and int. - * - * @param str a string containing a binary number - * - * @return the numeric value of the supplied string - */ - - private static int fromBinary(final String str) { - return Integer.parseInt(str, 2); - } - - /** - * Mask that preserves the last 12 bits (bits in dataword). - */ - - private static final int MASK = 0xfff; //== fromBinary("111111111111"); - - /** - * Generator matrix for the code, multiplied with a dataword to generate a codeword. 
- */ - - private static final int[] sGenerator = { - - fromBinary("100000000000"), - fromBinary("010000000000"), - fromBinary("001000000000"), - fromBinary("000100000000"), - fromBinary("000010000000"), - fromBinary("000001000000"), - fromBinary("000000100000"), - fromBinary("000000010000"), - fromBinary("000000001000"), - fromBinary("000000000100"), - fromBinary("000000000010"), - fromBinary("000000000001"), - - /* ALTERNATIVE MATRIX - UNUSED - fromBinary("110111000101"), - fromBinary("101110001011"), - fromBinary("011100010111"), - fromBinary("111000101101"), - fromBinary("110001011011"), - fromBinary("100010110111"), - fromBinary("000101101111"), - fromBinary("001011011101"), - fromBinary("010110111001"), - fromBinary("101101110001"), - fromBinary("011011100011"), - fromBinary("111111111110"), - */ - - fromBinary("011111111111"), - fromBinary("111011100010"), - fromBinary("110111000101"), - fromBinary("101110001011"), - fromBinary("111100010110"), - fromBinary("111000101101"), - fromBinary("110001011011"), - fromBinary("100010110111"), - fromBinary("100101101110"), - fromBinary("101011011100"), - fromBinary("110110111000"), - fromBinary("101101110001"), - }; - - /** - * Transpose of the generator matrix, multiplied with a codeword to generate a syndrome. - */ - - private static final int[] sCheck = { - - fromBinary("011111111111100000000000"), - fromBinary("111011100010010000000000"), - fromBinary("110111000101001000000000"), - fromBinary("101110001011000100000000"), - fromBinary("111100010110000010000000"), - fromBinary("111000101101000001000000"), - fromBinary("110001011011000000100000"), - fromBinary("100010110111000000010000"), - fromBinary("100101101110000000001000"), - fromBinary("101011011100000000000100"), - fromBinary("110110111000000000000010"), - fromBinary("101101110001000000000001"), - - }; - - /** - * A 4096 (2^12) element array that maps datawords to codewords. 
- */ - - private static final int[] sCodewords; - - /** - * A 4096 (2^12) element array that maps syndromes to error bits. - */ - - private static final int[] sErrors; - - //static initialization - static { - sCodewords = computeCodewords(); - sErrors = computeErrors(); - } - - /** - * Generates the codewords array. - * - * @return an array for assignment to {@link sCodewords} - */ - - private static int[] computeCodewords() { - int[] cws = new int[4096]; - //iterate over all valid datawords - for (int i = 0; i < 4096; i++) { - //multiply dataword by generator matrix - int cw = 0; - for (int j = 0; j < 24; j++) { - int d = i & sGenerator[j]; - int p = Integer.bitCount(d); - cw = (cw << 1) | (p & 1); - } - //store resulting codeword - cws[i] = cw; - } - return cws; - } - - /** - * Generates error array. - * - * @return an array for assignment to {@link sErrors} - */ - - private static int[] computeErrors() { - int[] errors = new int[4096]; - //fill array with -1 (indicates that error cannot be corrected - Arrays.fill(errors, -1); - - //record syndrome for zero error (valid) word - { - int error = 0; - int syn = syndrome(error); - errors[syn] = error; - } - - //record syndrome for each single error word - for (int i = 0; i < 24; i++) { - int error = 1 << i; - int syn = syndrome(error); - errors[syn] = error; - } - - //record syndrome for each double error word - for (int i = 1; i < 24; i++) { - for (int j = 0; j < i; j++) { - int error = (1 << i) | (1 << j); - int syn = syndrome(error); - errors[syn] = error; - } - } - - //record syndrome for each triple error word - for (int i = 2; i < 24; i++) { - for (int j = 1; j < i; j++) { - for (int k = 0; k < j; k++) { - int error = (1 << i) | (1 << j) | (1 << k); - int syn = syndrome(error); - errors[syn] = error; - } - } - } - - //code can't resolve quadruple errors - return errors; - } - - /** - * Encodes a 12 bit data word into a codeword. 
The 12 bits must be in the - * least significant positions and all other supplied bits must be zero. - * - * @param data a 12 bit data word - * @return the 24 bit code word - */ - - public static int encode(final int data) { - return sCodewords[data]; - } - - /** - * Computes the syndrome for the supplied codeword. The 24 bits must be in - * the least significant positions. - * - * @param word a candidate code word - * @return the syndrome for the supplied word - */ - - public static int syndrome(final int word) { - //multiply codeword by the check matrix - int syndrome = 0; - for (int j = 0; j < 12; j++) { - int d = word & sCheck[j]; - int p = Integer.bitCount(d); - syndrome = (syndrome << 1) | (p & 1); - } - return syndrome; - } - - /** - * Whether the supplied candidate code word is a valid code word. The 24 - * bits must be in the least significant positions and all other supplied - * bits must be zero. - * - * @param word the candidate code word - * @return true iff the supplied word is a valid codeword - */ - - public static boolean isCodeword(final int word) { - //optimization - is it worth it? - int w = Integer.bitCount(word); - if (w != 0 && w != 8 && w != 12 && w != 16 && w != 24) return false; - return syndrome(word) == 0; - } - - /** - * Decodes a valid code word into a dataword. - * - * @param codeword a valid code word - * @return the corresponding data word - */ - public static int decodeWord(final int codeword) { - return (codeword >> 12) & MASK; - } - - /** - * Attempts to correct and decode a codeword. The 24 bits must be in the - * least significant positions and all other supplied bits must be zero. - * NOTE: for codewords with four errors, this method does not attempt any correction - * - * @param word a word to be decoded - * @return a decoded and possibly corrected data word - */ - - public static int correctAndDecode(final int word) { - int err = sErrors[ syndrome(word) ]; - //for 4 errors we currently just give up!! - return err <= 0 ? 
decodeWord(word) : decodeWord(word ^ err); - } - - private float[] variance; - - // constructor - - /** - * Cannot be instantiated. - */ - - public Golay() { } - - - public static void main(String[] args) { - Random r = new Random(); - int d = 24; - - Golay sp = new Golay(); - MurmurHash hash = new MurmurHash(Integer.MAX_VALUE); - float testResolution = 10000f; - - for (int i = 0; i < 300; i++) { - int ct = 0; - float distavg = 0.0f; - for (int j = 0; j < testResolution; j++) { - float p1[] = new float[d]; - float p2[] = new float[d]; - - // generate a vector - for (int k = 0; k < d; k++) { - p1[k] = r.nextFloat() * 2 - 1; - p2[k] = (float) (p1[k] + r.nextGaussian() - * ((float) i / 1000f)); - } - float dist = VectorUtil.distance(p1, p2); - distavg += dist; - - long hp1 = hash.hash(sp.decode(p1)); - long hp2 = hash.hash(sp.decode(p2)); - - ct+=(hp2==hp1)?1:0; - - } - System.out.println(distavg / testResolution + "\t" + (float) ct - / testResolution); - } - } - -// float varTot = 1.0f; - @Override - public long[] decode(float[] p1) { - int codeword = 0; - if(p1[0]>0)codeword+=1; - for(int i=1;i<24;i++){ - codeword<<=1; - if(p1[i]>0)codeword+=1; - } - return new long[]{correctAndDecode(codeword)}; - } - -// @Override -// public void setVariance(float[] parameterObject) { -// variance = parameterObject; -// for(int i = 0 ; i - float[] rndBs; - // Vector> - float[][] stableArray; - - public PsdLSH(int M, int L, int D, int T, float W) { - this.M = M; - this.L = L; - this.T = T; - this.W = W; - this.D = D; - bits = (int) Math.ceil(Math.log(M) / Math.log(2)); - rndBs = new float[L]; - stableArray = new float[L][D]; - initialize(); - } - - public PsdLSH() { - M = 256; - L = 4; - T = GAUSSIAN; - W = 2f; - D = 32; - bits = (int) Math.ceil(Math.log(M) / Math.log(2)); - rndBs = new float[L]; - stableArray = new float[L][D]; - ; - initialize(); - } - - public PsdLSH(int psdtype, int innerDecoderMultiplier) { - M = 256; - L = 4; - T = psdtype; - if (psdtype == LEVY) - W = 2f; - 
if (psdtype == GAUSSIAN) - W = 1f; - if (psdtype == CAUCHY) - W = 2f; - D = innerDecoderMultiplier; - - bits = (int) Math.ceil(Math.log(M) / Math.log(2)); - rndBs = new float[L]; - stableArray = new float[L][D]; - ; - initialize(); - } - - private void initialize() { - - Random rng = new Random(); - - switch (T) { - case 0: { - LevyDistribution ld = new LevyDistribution(0, 1); - for (int l = 0; l < L; l++) { - int d = 0; - while (d < D) { - stableArray[l][d] = (float) ld.sample(); - if (stableArray[l][d] < 3f && stableArray[l][d] > -3f) { - d++; - } - } - rndBs[l] = rng.nextFloat() * W; - } - return; - } - - case 1: { - CauchyDistribution cd = new CauchyDistribution(); - - for (int l = 0; l < L; l++) { - int d = 0; - while (d < D) { - stableArray[l][d] = (float) cd.sample(); - if (stableArray[l][d] < 3f && stableArray[l][d] > -3f) { - d++; - } - } - - rndBs[l] = rng.nextFloat() * W; - } - return; - } - case 2: { - for (int l = 0; l < L; l++) { - for (int d = 0; d < D; d++) { - stableArray[l][d] = (float) rng.nextGaussian(); - } - rndBs[l] = rng.nextFloat() * W; - } - return; - } - default: { - return; - } - } - } - - long[] hash(float[] v) { - - long[] hashVal = new long[1]; - // long hashVal = 0; - int tmp; - for (int l = 0; l < L; l++) { - // dot product with stable distribution - float sum = rndBs[l]; - for (int d = 0; d < D; d++) { - sum += v[d] * stableArray[l][d]; - } - tmp = ((int) ((sum) / W)); - tmp %= M; - // shift negative number to the other side - hashVal[0] += tmp; - hashVal[0] <<= this.bits; - } - return hashVal; - } - - public static void main(String[] args) { - Random r = new Random(); - // int M = 256; - // int L = 8; - // int T = LEVY; - // float W = 1f; - int d = 24; - - PsdLSH sp = new PsdLSH(); - - // MultiDecoder sp = new MultiDecoder( d, e8); - MurmurHash hash = new MurmurHash(Integer.MAX_VALUE); - float testResolution = 10000f; - - HashMap ctmap = new HashMap(); - - for (int i = 0; i < 400; i++) { - int ct = 0; - float distavg = 0.0f; - for 
(int j = 0; j < testResolution; j++) { - float p1[] = new float[d]; - float p2[] = new float[d]; - - // generate a vector - for (int k = 0; k < d; k++) { - p1[k] = r.nextFloat() * 2 - 1f; - p2[k] = (float) (p1[k] + r.nextGaussian() - * ((float) i / 1000f)); - } - float dist = VectorUtil.distance(p1, p2); - distavg += dist; - long[] l1 = sp.decode(p1); - long[] l2 = sp.decode(p2); - - ctmap.put(l1[0], - ctmap.containsKey(l1[0]) ? 1 + ctmap.get(l1[0]) : 1); - - long hp1 = hash.hash(l1); - long hp2 = hash.hash(l2); - - // ctmap.put(hp1,ctmap.containsKey(hp1)?1+ctmap.get(hp1):1); - - ct += (hp2 == hp1) ? 1 : 0; - - } - - System.out.println(distavg / testResolution + "\t" + (float) ct - / testResolution); - } - } - -// @Override -// public void setVariance(float[] parameterObject) { -// this.variance = parameterObject; -// } - - @Override - public int getDimensionality() { - return D; - } - - @Override - public long[] decode(float[] f) { - return hash(f); - } - - @Override - public float getErrorRadius() { - return 1; - } - - @Override - public float getDistance() { - return 0; - } - - @Override - public boolean selfScaling() { - return true; - } - - @Override - public void setCounter(Countable counter) { - // TODO Auto-generated method stub - - } - -// @Override -// public float[] getVariance() { -// return this.variance; -// } - -} diff --git a/src/main/java/edu/uc/rphash/decoders/SphericalRandom.java b/src/main/java/edu/uc/rphash/decoders/SphericalRandom.java new file mode 100644 index 0000000..32c4250 --- /dev/null +++ b/src/main/java/edu/uc/rphash/decoders/SphericalRandom.java @@ -0,0 +1,292 @@ +package edu.uc.rphash.decoders; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Random; + +import edu.uc.rphash.frequentItemSet.Countable; +import edu.uc.rphash.standardhash.MurmurHash; +import edu.uc.rphash.util.VectorUtil; + +/** + * Spherical LSH Decoder based on SLSH (lgpl) + * + * @author lee + * + */ +public class 
SphericalRandom implements Decoder { + int HashBits = 32; + final List> vAll; // vAll[i][j] is the vector $A_i \tilde v_j$ + // from + // the article. + int hbits; // Ceil(Log2(2*d)). + int d; // the dimension of the feature space. + int k; // number of elementary hash functions (h) to be concatenated to + // obtain a reliable enough hash function (g). LSH queries becomes + // more selective with increasing k, due to the reduced the + // probability of collision. + int l; // number of "copies" of the bins (with a different random matrices). + // Increasing L will increase the number of points the should be + // scanned linearly during query. + float distance = 0; + + /** + * This class represent a spherical lsh scheme. Vectors are decoded to the + * nearest vertex of the d dimensional orthoplex reresented by a canonical + * ordered integer. + * + * @param d + * - the number of dimension in the orthoplex + * @param k + * - number of rotations of the fundamental hash functions + * @param L + * - the number to search, currently ignored in RPHash + */ + public SphericalRandom(int d, int k, int L) { + this.d = d;// number of dimensions + this.k = k;// number of elementary hash functions + this.l = L;// L;//number of copies to search + double nvertex = 2.0 * this.d; + this.hbits = (int) Math.ceil(Math.log(nvertex) / Math.log(2)); + int kmax = (int) (HashBits / this.hbits); + if (this.k > kmax) { + this.k = kmax; + System.out + .printf("k is too big, chopping down (%d->%d)\n", k, kmax); + } + + Random[] r = new Random[d]; + for (int i = 0; i < d; i++) + r[i] = new Random(); + + // For orthoplex, the basis Vectortors v_i are permutations of the + // Vectortor (1, 0, ..., 0), + // and -(1, 0, ..., 0). + // Thus R v_i simply picks up the ith row of the rotation matrix, up to + // a sign. + // This means we don't need any matrix multiplication; R matrix is the + // list of + // rotated vectors itself! 
+ this.vAll = new ArrayList>(k * l); // random rotation + // matrices + for (int i = 0; i < k * l; i++) { + this.vAll.add(i, randomRotation(this.d, r)); + } + } + + @Override + public int getDimensionality() { + return d; + } + + @Override + public long[] decode(float[] f) { + return Hash(f); + } + + @Override + public float getErrorRadius() { + return d; + } + + @Override + public float getDistance() { + return distance; + } + + long argmaxi(float[] p, int index) { + List vs = vAll.get(index); + long maxi = 0; + float max = 0; + for (int i = 0; i < this.d; i++) { + + float dot = dot(p, vs.get(i)); + // compute orthoplex of -1 and 1 simultaneously + + + //float dot = dotshift(p, vs.get(i)); // aas we are using dotshift the full matrix needs storing. incorporate that. + + float abs = dot >= 0 ? dot : -dot; + if (abs < max) { + continue; + } + max = abs; + maxi = dot >= 0 ? i : i + this.d; + } + return maxi; + } + + float norm(float[] t) { + float n = 0; + for (int i = 0; i < t.length; i++) { + n += t[i] * t[i]; + } + return (float) Math.sqrt(n); + } + + float[] scale(float[] t, float s) { + for (int i = 0; i < t.length; i++) { + t[i] *= s; + } + return t; + } + + float dot(float[] t, float[] u) { + float s = 0; + for (int i = 0; i < t.length; i++) { + s += t[i] * u[i]; + } + return s; + } + + + + float dotshift(float[] t, float[] u) { + float s = 0; + for (int i = 0; i < t.length; i++) { + + + s = (float) ( s + ((t[i]*0.2)+0.1) * u[i] ); + } + return s; + } + + + + float[] sub(float[] t, float[] u) { + for (int i = 0; i < t.length; i++) { + t[i] -= u[i]; + } + return t; + } + + float[] random(int d, Random[] r) { + + float[] v = new float[d]; + + for (int i = 0; i < d; i++) { + v[i] = (float) r[i].nextGaussian(); + } + return v; + } + + List randomRotation(int d, Random[] r2) { + ArrayList R = new ArrayList<>(d); + for (int i = 0; i < d; i++) { + R.add(i, random(d, r2)); + float[] u = R.get(i); + for (int j = 0; j < i; j++) { + float[] v = R.get(j); + float vnorm = 
norm(v); + if (vnorm == 0) { + return randomRotation(d, r2); + } + float[] vs = new float[v.length]; + System.arraycopy(v, 0, vs, 0, v.length); + scale(vs, dot(v, u) / vnorm); + u = sub(u, vs); + } + u = scale(u, 1.0f / norm(u)); + } + return R; + } + + // Hashes a single point slsh.l times, using a different set of + // random matrices created and stored by the constructor for each. + // Stores the result in g to avoid unnecessary allocations. + // + // SLSH requires that all vectors lie on a d-dimensional hypershpere, + // thus having the same norm. Only the Similarity method of FeatureVector + // is required to take the normalization into account. + // + // The complexity of this function is O(nLK) + long[] Hash(float[] p) { + int ri = 0; + long[] h = new long[l]; + float normp = norm(p); + p = scale(p, 1.0f / normp); + for (int i = 0; i < this.l; i++) { + for (int j = 0; j < this.k; j++) { + h[i] = h[i] | this.argmaxi(p, ri); + h[i] <<= this.hbits; + ri++; + } + } + + return h;//+ (int) (normp); + + } + + public static void main(String[] args) { + Random r = new Random(); + int d = 16; + int K = 3; + int L = 1; + Spherical sp = new Spherical(d, K, L); + + // MultiDecoder sp = new MultiDecoder( d, e8); + MurmurHash hash = new MurmurHash(Integer.MAX_VALUE); + float testResolution = 10000f; + + HashMap ctmap = new HashMap(); + + for (int i = 0; i < 400; i++) { + int ct = 0; + float distavg = 0.0f; + for (int j = 0; j < testResolution; j++) { + float p1[] = new float[d]; + float p2[] = new float[d]; + + // generate a vector + for (int k = 0; k < d; k++) { + p1[k] = r.nextFloat() * 2 - 1f; + p2[k] = (float) (p1[k] + r.nextGaussian() + * ((float) i / 1000f)); + } + float dist = VectorUtil.distance(p1, p2); + distavg += dist; + long[] l1 = sp.decode(p1); + long[] l2 = sp.decode(p2); + + ctmap.put(l1[0], + ctmap.containsKey(l1[0]) ? 
1 + ctmap.get(l1[0]) : 1); + + long hp1 = hash.hash(l1); + long hp2 = hash.hash(l2); + + // ctmap.put(hp1,ctmap.containsKey(hp1)?1+ctmap.get(hp1):1); + + ct += (hp2 == hp1) ? 1 : 0; + + } + + System.out.println(distavg / testResolution + "\t" + (float) ct + / testResolution); + } + } + + float[] variance; + +// @Override +// public void setVariance(float[] parameterObject) { +// variance = parameterObject; +// } +// +// @Override +// public float[] getVariance(){ +// return variance; +// } + + @Override + public boolean selfScaling() { + return true; + } + + @Override + public void setCounter(Countable counter) { + // TODO Auto-generated method stub + + } + +} diff --git a/src/main/java/edu/uc/rphash/frequentItemSet/KHHCentroidCounterPush.java b/src/main/java/edu/uc/rphash/frequentItemSet/KHHCentroidCounterPush.java deleted file mode 100644 index 819135a..0000000 --- a/src/main/java/edu/uc/rphash/frequentItemSet/KHHCentroidCounterPush.java +++ /dev/null @@ -1,56 +0,0 @@ -package edu.uc.rphash.frequentItemSet; - -import java.util.Iterator; -import java.util.List; - -import edu.uc.rphash.Centroid; -import edu.uc.rphash.knee.KneeAlgorithm; - -public class KHHCentroidCounterPush extends KHHCentroidCounter { - - int estimatedKnee = 0; - KneeAlgorithm kne; - - public KHHCentroidCounterPush(float decay, KneeAlgorithm kne) { - super(1000, decay); - this.kne = kne; - } - - /* - * (non-Javadoc) - * - * @see edu.uc.rphash.frequentItemSet.KHHCentroidCounter#getTop() - */ - @Override - public List getTop() { - return super.getTop(); - } - - /** - * @see edu.uc.rphash.frequentItemSet.KHHCentroidCounter#add(edu.uc.rphash.Centroid) - * This method adds a new vector to the khhcounter and performs knee - * finding on the khhset - * @param the - * cluster to be added c - * @return the estimated number of clusters using the provided KneeAlgorithm - * if the estimation changes or -1 if it does not - */ - public int addAndUpdate(Centroid c) { - super.add(c); - // check for new clusters 
- int size = frequentItems.values().size(); - float[] counts = new float[size]; - Iterator it = frequentItems.values().iterator(); - for (int i = 0; it.hasNext(); i++) { - counts[i] = it.next().getCount(); - } - int tmpknee = kne.findKnee(counts); - if (tmpknee != estimatedKnee) { - estimatedKnee = tmpknee; - return estimatedKnee; - } else { - return -1; - } - } - -} diff --git a/src/main/java/edu/uc/rphash/knee/BiggestMergeKnee.java b/src/main/java/edu/uc/rphash/knee/BiggestMergeKnee.java deleted file mode 100644 index 2980d0d..0000000 --- a/src/main/java/edu/uc/rphash/knee/BiggestMergeKnee.java +++ /dev/null @@ -1,38 +0,0 @@ -package edu.uc.rphash.knee; - -import edu.uc.rphash.util.VectorUtil; - -public class BiggestMergeKnee implements KneeAlgorithm { - - @Override - public int findKnee(float[] data) { - - return data.length/2; - } - - - - - /** - * this function creates a linear model y=alpha*x+beta for the given data - * series x,y. - */ - float[] linest(float[] y) { - - int n = y.length; - float[] x = new float[n]; - for(int i = 0;imaxdist){ - maxdist = tmpdist; - argmax = i; - } - } - return argmax; - } - - - - /** - * this function creates a linear model y=alpha*x+beta for the given data - * series x,y. - */ - float[] linest(float[] y) { - - int n = y.length; - float[] x = new float[n]; - for(int i = 0;i findCandidateIndices(double[][] data, boolean findMinima){ + ArrayList candidates = new ArrayList<>(); + //a coordinate is considered a candidate if both of its adjacent points have y-values + //that are greater or less (depending on whether we want local minima or local maxima) + for (int i = 1; i < data.length - 1; i++) { + double prev = data[i-1][1]; + double cur = data[i][1]; + double next = data[i+1][1]; + boolean isCandidate = (findMinima) ? 
(prev > cur && next > cur) : (prev < cur && next < cur); + if(isCandidate){ + candidates.add(i); + } + } + return candidates; + } + + + /** + * Find the index in the data the represents a most exaggerated elbow point. + * @param data the data to find an elbow in + * @return The index of the elbow point. + */ + private int findElbowIndex(double[] data){ + + int bestIdx = 0; + double bestScore = 0; + for (int i = 0; i < data.length; i++) { + double score = Math.abs(data[i]); + if(score > bestScore){ + bestScore = score; + bestIdx = i; + } + } + return bestIdx; + } + + /** + * Prepares the data by smoothing, then normalising into unit range 0-1, + * and finally, subtracting the y-value from the x-value. + * @param data The data to prepare. + * @param smoothingWindow Size of the smoothing window. + * @return The normalised data. + */ + private double[][] prepare(double[][] data, int smoothingWindow){ + + //smooth the data to make local minimum/maximum easier to find (this is Step 1 in the paper) + // double[][] smoothedData = Maths.gaussianSmooth2d(data, smoothingWindow); + double[][] smoothedData2 = Maths.Smooth2d(data); + + // System.out.println("this is the smoothed out data using gaussian kernal -------------------"); + // System.out.println(Arrays.deepToString(smoothedData)); + // System.out.println(data.length); + + // System.out.println(";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;"); + + //// System.out.println("this is the smoothed out data using linear interpolation -------------------"); + //// System.out.println(Arrays.deepToString(smoothedData2)); + + + //prepare the data into the unit range (step 2 of paper) + + //double[][] normalisedData = Maths.minmaxNormalise(smoothedData ); + + double[][] normalisedData = Maths.minmaxNormalise(smoothedData2 ); + + //double[][] normalisedData = Maths.minmaxNormalise(data); + +//// System.out.println("this is the normalized elbow data -------------------"); +//// 
System.out.println(Arrays.deepToString(normalisedData)); + + //subtract normalised x from normalised y (this is step 3 in the paper) + for (int i = 0; i < normalisedData.length; i++) { + normalisedData[i][1] = normalisedData[i][1] - normalisedData[i][0]; + } + + return normalisedData; + } + + private double computeAverageVarianceX(double[][] data){ + double sumVariance = 0; + for (int i = 0; i < data.length - 1; i++) { + sumVariance += data[i + 1][0] - data[i][0]; + } + return sumVariance / (data.length - 1); + } + + /** + * Uses a heuristic to find what may be an elbow in the 1d data. + * This method is a heuristic so it may return in invalid elbow. + * If you need guarantees use the other method {@link JythonTest#run(double[][], double, int, boolean)} + * @param data The + * @return A possible elbow for this 1d data. + */ + public double findElbowQuick(double[] data){ + if(data.length <= 1){ + return 0; + } + // double[] normalisedData = Maths.minmaxNormalise1d(Maths.gaussianSmooth(data, 3)); // original parameter + double[] normalisedData = Maths.minmaxNormalise1d(Maths.gaussianSmooth(data, 0)); + + //do kneedle y'-x' (in this case x' is normalised index value) + for (int i = 0; i < normalisedData.length; i++) { + double normalisedIndex = (double)i / data.length; + normalisedData[i] = normalisedData[i] - normalisedIndex; + } + int elbowIdx = findElbowIndex(normalisedData); + return data[elbowIdx]; + } + + /** + * This algorithm finds the so-called elbow/knee in the data. + * See paper: "Finding a Kneedle in a Haystack: Detecting Knee Points in System Behavior" + * for more details. + * @param data The 2d data to find an elbow in. + * @param s How many "flat" points to require before we consider it a knee/elbow. + * @param smoothingWindow The data is smoothed using Gaussian kernel average smoother, this parameter is the window used for averaging + * (higher values mean more smoothing, try 3 to begin with). + * @param findElbows Whether to find elbows or knees. 
true for elbows and false for knees. + * @return The elbow or knee values. + */ + public ArrayList run(double[][] data, double s, int smoothingWindow, boolean findElbows){ + + if(data.length == 0){ + throw new IllegalArgumentException("Cannot find elbow or knee points in empty data."); + } + if(data[0].length != 2){ + throw new IllegalArgumentException("Cannot run Kneedle, this method expects all data to be 2d."); + } + + ArrayList localMinMaxPts = new ArrayList<>(); + //do steps 1,2,3 of the paper in the prepare method + double[][] normalisedData = prepare(data, smoothingWindow); + + //find candidate indices (this is step 4 in the paper) + { + ArrayList candidateIndices = findCandidateIndices(normalisedData, findElbows); + //ArrayList candidateIndices = findCandidateIndices(data, findElbows); + //go through each candidate index, i, and see if the indices after i are satisfy the threshold requirement + //(this is step 5 in the paper) + double step = computeAverageVarianceX(normalisedData); + step = findElbows ? step * s : step * -s; + + //check each candidate to see if it is a real elbow/knee + //(this is step 6 in the paper) + for (int i = 0; i < candidateIndices.size(); i++) { + Integer candidateIdx = candidateIndices.get(i); + Integer endIdx = (i + 1 < candidateIndices.size()) ? candidateIndices.get(i+1) : data.length; + + double threshold = normalisedData[candidateIdx][1] + step; + + for (int j = candidateIdx + 1; j < endIdx; j++) { + boolean isRealElbowOrKnee = (findElbows) ? 
+ normalisedData[j][1] > threshold : normalisedData[j][1] < threshold; + if(isRealElbowOrKnee) { + localMinMaxPts.add(data[candidateIdx]); + break; + } + } + } + } + return localMinMaxPts; + } + + +// method to call to find elbow + + public int find_elbow( List counts ){ + + int first_elbow; + int size_of_list = counts.size(); + int cutoff = 0; + // System.out.print("\n" + " size_of_list : " + size_of_list); + + // if(size_of_list >= 100){ + // cutoff = 100; + // } + // if(size_of_list < 100){ + // cutoff = size_of_list ; + // } + + cutoff =size_of_list; + //System.out.print("\n" + " cutoff : " + cutoff + "\n"); + //// System.out.print(" cutoff : " + cutoff + "\n"); + + List counts1 = counts; +//// System.out.print("\n" + " elbow values before smoothing : "+"\n" + counts1 + "\n"); + + double[][] elbowdata = new double[cutoff][2] ; + + for (int i= 0;i<(cutoff-1);i++) { + + elbowdata[i][1]= (cutoff-1)-i;} // index + + for (int i= 0;i run(double[][] data, double s, int smoothingWindow, boolean findElbows) + List list_of_elbows= new ArrayList<>(); + +ArrayList elbows = run ( elbowdata, 1 , 0, false); + + + + +//// System.out.print("\n" + "number of elbow points : " + elbows.size()); +for (double[] point : elbows) { +//System.out.print("\n" +"Knee point:" + Arrays.toString(point)); +//System.out.println("\n" +"No. of clusters complement = " + point[1] ); +//System.out.println("\n" + "No. 
of clusters = " + (elbowdata.length - point[1])); + +list_of_elbows.add(elbowdata.length - point[1]); + } + + +first_elbow = (int) list_of_elbows.get(0).intValue(); + +return first_elbow ; + + } + + + + +// to test the funtion : + public static void main(String[] args){ + + JythonTest elbowcalculator = new JythonTest(); + + double elbowdata[]= new double[90]; + + for (int i=0 ; i<=89; i++) + { + elbowdata[i] = 89-i; + } + + +/* double elbowdata2 [] = + { 7304, 6978, 6666, 6463, 6326, 6048, 6032, 5762, 5742, + 5398, 5256, 5226, 5001, 4941, 4854, 4734, 4558, 4491, + 4411, 4333, 4234, 4139, 4056, 4022, 3867, 3808, 3745, + 3692, 3645, 3618, 3574, 3504, 3452, 3401, 3382, 3340, + 3301, 3247, 3190, 3179, 3154, 3089, 3045, 2988, 2993, + 2941, 2875, 2866, 2834, 2785, 2759, 2763, 2720, 2660, + 2690, 2635, 2632, 2574, 2555, 2545, 2513, 2491, 2496, + 2466, 2442, 2420, 2381, 2388, 2340, 2335, 2318, 2319, + 2308, 2262, 2235, 2259, 2221, 2202, 2184, 2170, 2160, + 2127, 2134, 2101, 2101, 2066, 2074, 2063, 2048, 2031 }; +*/ + double elbowdata2[] = {5000, + 4000, + 3000, + 2000, + 1000, + 900, + 800, + 700, + 600, + 500, + 450, + 400, + 350, + 300, + 250, + 225, + 200, + 175, + 150, + 125, + 100, + 75, + 50, + 25, + 24, + 23, + 22, + 21, + 20, + 19, + 18, + 17, + 16, + 15, + 14, + 13, + 12, + 11, + 10, + 10, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 8, + } ; + + double elbow_point = elbowcalculator.findElbowQuick(elbowdata2); + + System.out.print("elbow point value form 1D data : "+ elbow_point); + + double[][] elbowdata3 = new double[50][2] ; + for (int i= 0;i<=49;i++) { + + elbowdata3[i][1]= 49-i;} + + for (int i= 0;i<=49;i++) + { + elbowdata3[i][0]= elbowdata2[i]; + } + // System.out.print("\n" +"elbowdata3 : " + elbowdata3[88][1]); + + // public ArrayList run(double[][] data, double s, int smoothingWindow, boolean findElbows) + + ArrayList elbows = elbowcalculator.run ( elbowdata3, 1 , 0, false); + + System.out.print("\n" + "number of elbow points : " + elbows.size()); + 
for (double[] point : elbows) { + System.out.print("\n" +"Knee point:" + Arrays.toString(point)); + System.out.println("\n" +"No. of clusters complement = " + point[1] ); + System.out.println("\n" + "No. of clusters = " + (elbowdata3.length - point[1])); + } + + +// +// double[][] testData = new double[][]{ +// new double[]{0,0}, +// new double[]{0.1, 0.55}, +// new double[]{0.2, 0.75}, +// new double[]{0.35, 0.825}, +// new double[]{0.45, 0.875}, +// new double[]{0.55, 0.9}, +// new double[]{0.675, 0.925}, +// new double[]{0.775, 0.95}, +// new double[]{0.875, 0.975}, +// new double[]{1,1} +// }; +// +// +// ArrayList kneePoints = new Kneedle().run(testData, 1, 1, false); +// +// for (double[] kneePoint : kneePoints) { +// System.out.println(); +// System.out.print("Knee point:" + Arrays.toString(kneePoint)); +// } +// +// +// double[][] testData2 = new double[][]{ +// new double[] { 200 , 9 }, +// new double[] { 100 , 8 }, +// new double[] { 75 , 7 }, +// new double[] { 50 , 6 }, +// new double[] { 48 , 5 }, +// new double[] { 45 , 4 }, +// new double[] { 42 , 3 }, +// new double[] { 40 , 2 }, +// new double[] { 39 , 1 }, +// new double[] { 38 , 0 } +// +// +// }; +// System.out.print("\n" + testData2[9][0]); +// +//// public ArrayList run(double[][] data, double s, int smoothingWindow, boolean findElbows) +// ArrayList kneePoints2 = new Kneedle().run(testData2, 0, 1, false); +// +// for (double[] point : kneePoints2) { +// System.out.print("\n" +"Knee point:" + Arrays.toString(point)); +// System.out.println("\n" +"No. of clusters = " + point[1] ); +// System.out.println("\n" + "No. 
of clusters = " + (testData2.length - point[1])); +// } + + + + + + + } + + +} \ No newline at end of file diff --git a/src/main/java/edu/uc/rphash/kneefinder/JythonTest2.java b/src/main/java/edu/uc/rphash/kneefinder/JythonTest2.java new file mode 100644 index 0000000..30c6e06 --- /dev/null +++ b/src/main/java/edu/uc/rphash/kneefinder/JythonTest2.java @@ -0,0 +1,83 @@ +package edu.uc.rphash.kneefinder; +import org.python.util.PythonInterpreter; + +import java.io.BufferedReader; +import java.io.InputStreamReader; + +import org.python.core.*; + +class JythonTest2 +{ + +//// does not work if there are external imports: + +// public static void main(String[] args) { +// PythonInterpreter interpreter = new PythonInterpreter(); +// +// interpreter.execfile("C:\\Users\\sayan\\eclipse-workspace\\pythonfunc\\pythonfunc1.py"); +// PyFunction function = (PyFunction)interpreter.get("my_test",PyFunction.class); +// PyObject pyobject = function.__call__(new PyString("huzhiweiww"),new PyString("2225")); +// System.out.println("anwser = " + pyobject.toString()); +// } +// + + + + public static void main(String[] args) { + + + + // xarray_1 = + // yarray_2= +/* String[] arguments = new String[] {"python", "C:\\Users\\sayan\\eclipse-workspace\\pythonfunc\\pythonfunc2.py" , "huzhiwei", "25", "C:/Users/sayan/Documents/testdata/data.xlsx"}; + try { + Process process = Runtime.getRuntime().exec(arguments); + BufferedReader in = new BufferedReader(new InputStreamReader(process.getInputStream())); + String line = null; + while ((line = in.readLine()) != null) { + System.out.println(line); + } + in.close(); + int re = process.waitFor(); + System.out.println(re); + } catch (Exception e) { + e.printStackTrace(); + } +*/ + String[] arguments2 = new String[] {"python", "C:\\Users\\sayan\\git\\rphash-java\\src\\main\\java\\edu\\uc\\rphash\\kneefinder\\KneeLocator.py" , "huzhiwei", "25", "C:/Users/sayan/Documents/testdata/data.xlsx"}; + try { + Process process = 
Runtime.getRuntime().exec(arguments2); + BufferedReader in = new BufferedReader(new InputStreamReader(process.getInputStream())); + String line = null; + while ((line = in.readLine()) != null) { + System.out.println(line); + } + in.close(); + int re = process.waitFor(); + System.out.println(re); + } catch (Exception e) { + e.printStackTrace(); + } + + int[] int_array_x = new int[] {1,2,3,4,5, 6,7,8,9,10,11,12,13,14,15,16,17,18 ,19,20,21}; + float[] float_array_y = new float[] {5000,4000,3000,2000,1000,900,800,700,600,500,450,400,350,300,250,225,200,175,150,125,100}; + + String[] arguments3 = new String[] {"python", "C:\\Users\\sayan\\git\\rphash-java\\src\\main\\java\\edu\\uc\\rphash\\kneefinder\\KneeLocator.py" , "huzhiwei", "25", "C:/Users/sayan/Documents/testdata/data.xlsx"}; + try { + Process process = Runtime.getRuntime().exec(arguments2); + BufferedReader in = new BufferedReader(new InputStreamReader(process.getInputStream())); + String line = null; + while ((line = in.readLine()) != null) { + System.out.println(line); + } + in.close(); + int re = process.waitFor(); + System.out.println(re); + } catch (Exception e) { + e.printStackTrace(); + } + + + + } +} diff --git a/src/main/java/edu/uc/rphash/kneefinder/JythonTest3.java b/src/main/java/edu/uc/rphash/kneefinder/JythonTest3.java new file mode 100644 index 0000000..7c8b069 --- /dev/null +++ b/src/main/java/edu/uc/rphash/kneefinder/JythonTest3.java @@ -0,0 +1,58 @@ +package edu.uc.rphash.kneefinder; +/* + * + * import org.python.util.PythonInterpreter; + * import org.python.core.PyInstance; + * + * import java.io.BufferedReader; import java.io.InputStreamReader; + * + * import org.python.core.*; + * + * class JythonTest3 { + * + * //// does not work if there are external imports: + * + * // public static void main(String[] args) { // PythonInterpreter interpreter + * = new PythonInterpreter(); // // interpreter.execfile( + * "C:\\Users\\sayan\\eclipse-workspace\\pythonfunc\\pythonfunc1.py"); // + * PyFunction 
function = + * (PyFunction)interpreter.get("my_test",PyFunction.class); // PyObject pyobject + * = function.__call__(new PyString("huzhiweiww"),new PyString("2225")); // + * System.out.println("anwser = " + pyobject.toString()); // } // + * + * static PythonInterpreter interpreter; + * + * @SuppressWarnings("resource") public static void main( String gargs[] ) { + * //String[] s = {"New York", "Chicago" , "errr"}; int[] s = new int[] + * {1,2,3,4,5, 6,7,8,9,10,11,12,13,14,15,16,17,18 ,19,20,21}; + * PythonInterpreter.initialize(System.getProperties(),System.getProperties(), + * s); interpreter = new PythonInterpreter(); interpreter.execfile( + * "C:\\Users\\sayan\\git\\rphash-java\\src\\main\\java\\edu\\uc\\rphash\\kneefinder\\PyScript.py" + * ); PyInstance hello = (PyInstance) interpreter.eval("PyScript" + "(" + "None" + * + ")"); } + * + * public void getData(Object[] data) { for (int i = 0; i < data.length; i++) { + * System.out.print(data[i].toString()); } + * + * } } + * + */ + + import org.python.util.PythonInterpreter; + import org.python.core.*; + + public class JythonTest3 { + public static void main(String a[]){ + + PythonInterpreter python = new PythonInterpreter(); + + int number1 = 5; + int number2 = 6; + + python.set("number1", new PyInteger(number1)); + python.set("number2", new PyInteger(number2)); + python.exec("number3 = number1+number2"); + PyObject number3 = python.get("number3"); + System.out.println("Returned Value is : "+number3.toString()); + } + } \ No newline at end of file diff --git a/src/main/java/edu/uc/rphash/kneefinder/KneeLocator.py b/src/main/java/edu/uc/rphash/kneefinder/KneeLocator.py new file mode 100644 index 0000000..9a6c11a --- /dev/null +++ b/src/main/java/edu/uc/rphash/kneefinder/KneeLocator.py @@ -0,0 +1,774 @@ +import sys +from scipy.constants import convert_temperature + +import numpy as np +from scipy import interpolate +from scipy.signal import argrelextrema +from sklearn.preprocessing import PolynomialFeatures +from 
sklearn.linear_model import LinearRegression +import warnings +from typing import Tuple, Optional, Iterable +import matplotlib.pyplot as plt +import pandas as pd + +import warnings # did not install + +#from edu.uc.rphash.kneefinder import JythonTest2 + + +def my_test(name, age, file): + + filename=file + + print(filename) + + print("name: "+name) + + print("age: "+age) + + print("2^10 : ") + + print( np.power(2,10)) + +# temperature=convert_temperature(np.array([-40, 40]), "Celsius", "Kelvin") + +# print(temperature) + + + + return filename + + + +#my_test(sys.argv[1], sys.argv[2], sys.argv[3]) # this is for the java calling + +#my_test("sam","25", "name") # this is for the python test + + + + + + + +def set_data( x,y): + + x_data = x + + y_data = y + + return (x_data , y_data) + + + + + +# knee test code : + + + +class KneeLocator(object): + + + + + + def __init__( + + self, + + x: Iterable[float], + + y: Iterable[float], + + S: float = 1.0, + + curve: str = "concave", + + direction: str = "increasing", + + interp_method: str = "interp1d", + + online: bool = False, + + + + + + ): + + """ + + Once instantiated, this class attempts to find the point of maximum + + curvature on a line. The knee is accessible via the `.knee` attribute. + + :param x: x values. + + :param y: y values. + + :param S: Sensitivity, original paper suggests default of 1.0 + + :param curve: If 'concave', algorithm will detect knees. If 'convex', it + + will detect elbows. 
+ + :param direction: one of {"increasing", "decreasing"} + + :param interp_method: one of {"interp1d", "polynomial"} + + :param online: Will correct old knee points if True, will return first knee if False + + """ + + # Step 0: Raw Input + + self.x = np.array(x) + + self.y = np.array(y) + + self.curve = curve + + self.direction = direction + + self.N = len(self.x) + + self.S = S + + self.all_knees = set() + + self.all_norm_knees = set() + + self.all_knees_y = [] + + self.all_norm_knees_y = [] + + self.online = online + + + + + + # Step 1: fit a smooth line + + if interp_method == "interp1d": + + uspline = interpolate.interp1d(self.x, self.y) + + self.Ds_y = uspline(self.x) + print("this is the smoothed data---------------------------") + print(self.Ds_y) + + + elif interp_method == "polynomial": + + pn_model = PolynomialFeatures(7) + + xpn = pn_model.fit_transform(self.x.reshape(-1, 1)) + + regr_model = LinearRegression() + + regr_model.fit(xpn, self.y) + + self.Ds_y = regr_model.predict( + + pn_model.fit_transform(self.x.reshape(-1, 1)) + + ) + + else: + + raise ValueError( + + "{} is an invalid interp_method parameter, use either 'interp1d' or 'polynomial'".format( + + interp_method + + ) + + ) + + + + # Step 2: normalize values + + self.x_normalized = self.__normalize(self.x) + + self.y_normalized = self.__normalize(self.Ds_y) + + + + # Step 3: Calculate the Difference curve + + self.x_normalized, self.y_normalized = self.transform_xy( + + self.x_normalized, self.y_normalized, self.direction, self.curve + + ) + + # normalized difference curve + + self.y_difference = self.y_normalized - self.x_normalized + + self.x_difference = self.x_normalized.copy() + + + + # Step 4: Identify local maxima/minima + + # local maxima + + self.maxima_indices = argrelextrema(self.y_difference, np.greater_equal)[0] + + self.x_difference_maxima = self.x_difference[self.maxima_indices] + + self.y_difference_maxima = self.y_difference[self.maxima_indices] + + + + # local minima + + 
self.minima_indices = argrelextrema(self.y_difference, np.less_equal)[0] + + self.x_difference_minima = self.x_difference[self.minima_indices] + + self.y_difference_minima = self.y_difference[self.minima_indices] + + + + # Step 5: Calculate thresholds + + self.Tmx = self.y_difference_maxima - ( + + self.S * np.abs(np.diff(self.x_normalized).mean()) + + ) + + + + # Step 6: find knee + + self.knee, self.norm_knee = self.find_knee() + + + + # Step 7: If we have a knee, extract data about it + + self.knee_y = self.norm_knee_y = None + + if self.knee: + + self.knee_y = self.y[self.x == self.knee][0] + + self.norm_knee_y = self.y_normalized[self.x_normalized == self.norm_knee][0] + + + + + + + + def set_filename_from_java(self,file): + + filename= file + + return filename + + + + @staticmethod + + def __normalize(a: Iterable[float]) -> Iterable[float]: + + """normalize an array + + :param a: The array to normalize + + """ + + return (a - min(a)) / (max(a) - min(a)) + + + + @staticmethod + + def transform_xy( + + x: Iterable[float], y: Iterable[float], direction: str, curve: str + + ) -> Tuple[Iterable[float], Iterable[float]]: + + """transform x and y to concave, increasing based on given direction and curve""" + + # convert elbows to knees + + if curve == "convex": + + x = x.max() - x + + y = y.max() - y + + # flip decreasing functions to increasing + + if direction == "decreasing": + + y = np.flip(y, axis=0) + + + + if curve == "convex": + + x = np.flip(x, axis=0) + + y = np.flip(y, axis=0) + + + + return x, y + + + + def find_knee(self,): + + """This function finds and sets the knee value and the normalized knee value. 
""" + + if not self.maxima_indices.size: + + warnings.warn( + + "No local maxima found in the difference curve\n" + + "The line is probably not polynomial, try plotting\n" + + "the difference curve with plt.plot(knee.x_difference, knee.y_difference)\n" + + "Also check that you aren't mistakenly setting the curve argument", + + RuntimeWarning, + + ) + + return None, None + + + + # placeholder for which threshold region i is located in. + + maxima_threshold_index = 0 + + minima_threshold_index = 0 + + # traverse the difference curve + + for i, x in enumerate(self.x_difference): + + # skip points on the curve before the the first local maxima + + if i < self.maxima_indices[0]: + + continue + + + + j = i + 1 + + + + # reached the end of the curve + + if x == 1.0: + + break + + + + # if we're at a local max, increment the maxima threshold index and continue + + if (self.maxima_indices == i).any(): + + threshold = self.Tmx[maxima_threshold_index] + + threshold_index = i + + maxima_threshold_index += 1 + + # values in difference curve are at or after a local minimum + + if (self.minima_indices == i).any(): + + threshold = 0.0 + + minima_threshold_index += 1 + + + + if self.y_difference[j] < threshold: + + if self.curve == "convex": + + if self.direction == "decreasing": + + knee = self.x[threshold_index] + + norm_knee = self.x_normalized[threshold_index] + + else: + + knee = self.x[-(threshold_index + 1)] + + norm_knee = self.x_normalized[-(threshold_index + 1)] + + + + elif self.curve == "concave": + + if self.direction == "decreasing": + + knee = self.x[-(threshold_index + 1)] + + norm_knee = self.x_normalized[-(threshold_index + 1)] + + else: + + knee = self.x[threshold_index] + + norm_knee = self.x_normalized[threshold_index] + + + + # add the y value at the knee + + y_at_knee = self.y[self.x == knee][0] + + y_norm_at_knee = self.y_normalized[self.x_normalized == norm_knee][0] + + if knee not in self.all_knees: + + self.all_knees_y.append(y_at_knee) + + 
self.all_norm_knees_y.append(y_norm_at_knee) + + + + # now add the knee + + self.all_knees.add(knee) + + self.all_norm_knees.add(norm_knee) + + + + # if detecting in offline mode, return the first knee found + + if self.online is False: + + return knee, norm_knee + + + + if self.all_knees == set(): + + warnings.warn("No knee/elbow found") + + return None, None + + + + return knee, norm_knee + + + + def plot_knee_normalized(self, figsize: Optional[Tuple[int, int]] = None): + + """Plot the normalized curve, the difference curve (x_difference, y_normalized) and the knee, if it exists. + + + + :param figsize: Optional[Tuple[int, int] + + The figure size of the plot. Example (12, 8) + + :return: NoReturn + + """ + + import matplotlib.pyplot as plt + + + + if figsize is None: + + figsize = (6, 6) + + + + plt.figure(figsize=figsize) + + plt.title("Normalized Knee Point") + + plt.plot(self.x_normalized, self.y_normalized, "b", label="normalized curve") + + plt.plot(self.x_difference, self.y_difference, "r", label="difference curve") + + plt.xticks( + + np.arange(self.x_normalized.min(), self.x_normalized.max() + 0.1, 0.1) + + ) + + plt.yticks( + + np.arange(self.y_difference.min(), self.y_normalized.max() + 0.1, 0.1) + + ) + + + + plt.vlines( + + self.norm_knee, + + plt.ylim()[0], + + plt.ylim()[1], + + linestyles="--", + + label="knee/elbow", + + ) + + plt.legend(loc="best") + + + + def plot_knee(self, figsize: Optional[Tuple[int, int]] = None): + + """ + + Plot the curve and the knee, if it exists + + + + :param figsize: Optional[Tuple[int, int] + + The figure size of the plot. 
Example (12, 8) + + :return: NoReturn + + """ + + import matplotlib.pyplot as plt + + + + if figsize is None: + + figsize = (6, 6) + + + + plt.figure(figsize=figsize) + + plt.title("Knee Point") + + plt.plot(self.x, self.y, "b", label="data") + + plt.vlines( + + self.knee, plt.ylim()[0], plt.ylim()[1], linestyles="--", label="knee/elbow" + + ) + + plt.legend(loc="best") + + + + # Niceties for users working with elbows rather than knees + + @property + + def elbow(self): + + return self.knee + + + + @property + + def norm_elbow(self): + + return self.norm_knee + + + + @property + + def elbow_y(self): + + return self.knee_y + + + + @property + + def norm_elbow_y(self): + + return self.norm_knee_y + + + + @property + + def all_elbows(self): + + return self.all_knees + + + + @property + + def all_norm_elbows(self): + + return self.all_norm_knees + + + + @property + + def all_elbows_y(self): + + return self.all_knees_y + + + + @property + + def all_norm_elbows_y(self): + + return self.all_norm_knees_y + + + + + +## xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + + + +#df=pd.read_excel("C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/run_results/3runs/testdata.xlsx") + +#df=pd.read_excel("data.xlsx", sheet_name='har2', header=None, na_values=['NA'], usecols="Aq,at",skiprows=range(97),nrows=6) + + + +#nameoffile = my_test(sys.argv[1], sys.argv[2], sys.argv[3]) # this is for the java calling + +nameoffile = my_test("sam","25", "C:/Users/sayan/Documents/testdata/data.xlsx") # this is for the python test + + + +#nameoffile_1 = "C:/Users/sayan/Documents/testdata/data.xlsx" + +df=pd.read_excel(nameoffile, sheet_name='Sheet1', header=None, na_values=['NA']) + +print(df) + +conv_arr= df.values + + + + +#split matrix into 3 columns each into 1d array + +#print(conv_arr.shape) + +#print(conv_arr[1,1]) + +arr1 = np.delete(conv_arr,1,axis=1) + +arr2 = np.delete(conv_arr,0,axis=1) + + + +#converting into 1D array + +x = arr1.ravel() + +y = 
arr2.ravel() + + + +kn = KneeLocator(list(x), y , S=0.0, curve='convex', direction='decreasing',online=False ) #,interp_method='polynomial') + +#kn.set_filename_from_java("C:/Users/sayan/Documents/testdata/data.xlsx") + + + + + +kn2 = KneeLocator(list(x), y , S=1.0, curve='convex', direction='decreasing',online=False ) + +print(kn.knee) + +print(kn2.knee) + +print("success") + + + + +#print(kn.norm_knee) + + + +# plt.style.use('ggplot') + +# plt.plot() + +# plt.xlabel('K (no. of clusters) ') + +# plt.ylabel('WCSSE') + +# #plt.title('Elbow method for optimal k.[data=HAR, k=4, Pred. k= %d]' %(kn.knee)) + +# plt.suptitle('Elbow Method For Optimal Cluster Determination [data=HAR_4clus, K=4, Pred.K = %d]' %(kn.knee),x=0.5, y=0.000, ha="center" , va="bottom") + +# plt.plot(x, y, 'bx-') + +# #plt.xscale('log') + +# plt.grid(True) + +# plt.xticks() + +# plt.vlines(kn.knee, plt.ylim()[0], plt.ylim()[1], linestyles='dashed') + +# plt.savefig("C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/run_results/3runs/graphs/test1.pdf") + +# plt.show() + +# + +# plt.style.use('ggplot') + +# plt.plot() + +# plt.xlabel('Buckets') + +# plt.ylabel('Counts') + +# plt.title('Elbow method for optimal k. [data=NOISE_30_1, k=10, Pred. 
k= %d]' %(kn2.knee)) + +# plt.plot(x, y, 'bx-') + +# #plt.xscale('log') + +# plt.grid(True) + +# plt.xticks() + +# plt.vlines(kn2.knee, plt.ylim()[0], plt.ylim()[1], linestyles='dashed') + +# plt.savefig("C:/Users/deysn/OneDrive - University of Cincinnati/Documents/temp/run_results/3runs/graphs/test2.pdf") + +# plt.show() diff --git a/src/main/java/edu/uc/rphash/kneefinder/Kneedle.java b/src/main/java/edu/uc/rphash/kneefinder/Kneedle.java new file mode 100644 index 0000000..17646ff --- /dev/null +++ b/src/main/java/edu/uc/rphash/kneefinder/Kneedle.java @@ -0,0 +1,170 @@ +package edu.uc.rphash.kneefinder; + + +import edu.uc.rphash.util.Maths; + +import java.util.ArrayList; + + +// to find the knee, taken from " https://github.com/lukehb/137-stopmove/blob/master/src/main/java/onethreeseven/stopmove/algorithm/Kneedle.java by Luke Bermingham " + +/** + * Given set of values look for the elbow/knee points. + * See paper: "Finding a Kneedle in a Haystack: Detecting Knee Points in System Behavior" + */ + + +public class Kneedle { + + /** + * Finds the indices of all local minimum or local maximum values. + * @param data The data to process + * @param findMinima If true find local minimums, else find local maximums. + * @return A list of the indices that have local minimum or maximum values. + */ + private ArrayList findCandidateIndices(double[][] data, boolean findMinima){ + ArrayList candidates = new ArrayList<>(); + //a coordinate is considered a candidate if both of its adjacent points have y-values + //that are greater or less (depending on whether we want local minima or local maxima) + for (int i = 1; i < data.length - 1; i++) { + double prev = data[i-1][1]; + double cur = data[i][1]; + double next = data[i+1][1]; + boolean isCandidate = (findMinima) ? (prev > cur && next > cur) : (prev < cur && next < cur); + if(isCandidate){ + candidates.add(i); + } + } + return candidates; + } + + + /** + * Find the index in the data the represents a most exaggerated elbow point. 
+ * @param data the data to find an elbow in + * @return The index of the elbow point. + */ + private int findElbowIndex(double[] data){ + + int bestIdx = 0; + double bestScore = 0; + for (int i = 0; i < data.length; i++) { + double score = Math.abs(data[i]); + if(score > bestScore){ + bestScore = score; + bestIdx = i; + } + } + return bestIdx; + } + + /** + * Prepares the data by smoothing, then normalising into unit range 0-1, + * and finally, subtracting the y-value from the x-value. + * @param data The data to prepare. + * @param smoothingWindow Size of the smoothing window. + * @return The normalised data. + */ + private double[][] prepare(double[][] data, int smoothingWindow){ + + //smooth the data to make local minimum/maximum easier to find (this is Step 1 in the paper) + double[][] smoothedData = Maths.gaussianSmooth2d(data, smoothingWindow); + + //prepare the data into the unit range (step 2 of paper) + double[][] normalisedData = Maths.minmaxNormalise(smoothedData); + + //subtract normalised x from normalised y (this is step 3 in the paper) + for (int i = 0; i < normalisedData.length; i++) { + normalisedData[i][1] = normalisedData[i][1] - normalisedData[i][0]; + } + + return normalisedData; + } + + private double computeAverageVarianceX(double[][] data){ + double sumVariance = 0; + for (int i = 0; i < data.length - 1; i++) { + sumVariance += data[i + 1][0] - data[i][0]; + } + return sumVariance / (data.length - 1); + } + + /** + * Uses a heuristic to find what may be an elbow in the 1d data. + * This method is a heuristic so it may return in invalid elbow. + * If you need guarantees use the other method {@link Kneedle#run(double[][], double, int, boolean)} + * @param data The + * @return A possible elbow for this 1d data. 
+ */ + public double findElbowQuick(double[] data){ + if(data.length <= 1){ + return 0; + } + + double[] normalisedData = Maths.minmaxNormalise1d(Maths.gaussianSmooth(data, 1)); + + //do kneedle y'-x' (in this case x' is normalised index value) + for (int i = 0; i < normalisedData.length; i++) { + double normalisedIndex = (double)i / data.length; + normalisedData[i] = normalisedData[i] - normalisedIndex; + } + + int elbowIdx = findElbowIndex(normalisedData); + return data[elbowIdx]; + } + + /** + * This algorithm finds the so-called elbow/knee in the data. + * See paper: "Finding a Kneedle in a Haystack: Detecting Knee Points in System Behavior" + * for more details. + * @param data The 2d data to find an elbow in. + * @param s How many "flat" points to require before we consider it a knee/elbow. + * @param smoothingWindow The data is smoothed using Gaussian kernel average smoother, this parameter is the window used for averaging + * (higher values mean more smoothing, try 3 to begin with). + * @param findElbows Whether to find elbows or knees. + * @return The elbow or knee values. + */ + public ArrayList run(double[][] data, double s, int smoothingWindow, boolean findElbows){ + + if(data.length == 0){ + throw new IllegalArgumentException("Cannot find elbow or knee points in empty data."); + } + if(data[0].length != 2){ + throw new IllegalArgumentException("Cannot run Kneedle, this method expects all data to be 2d."); + } + + ArrayList localMinMaxPts = new ArrayList<>(); + //do steps 1,2,3 of the paper in the prepare method + double[][] normalisedData = prepare(data, smoothingWindow); + //find candidate indices (this is step 4 in the paper) + { + ArrayList candidateIndices = findCandidateIndices(normalisedData, findElbows); + //go through each candidate index, i, and see if the indices after i are satisfy the threshold requirement + //(this is step 5 in the paper) + double step = computeAverageVarianceX(normalisedData); + step = findElbows ? 
step * s : step * -s; + + //check each candidate to see if it is a real elbow/knee + //(this is step 6 in the paper) + for (int i = 0; i < candidateIndices.size(); i++) { + Integer candidateIdx = candidateIndices.get(i); + Integer endIdx = (i + 1 < candidateIndices.size()) ? candidateIndices.get(i+1) : data.length; + + double threshold = normalisedData[candidateIdx][1] + step; + + for (int j = candidateIdx + 1; j < endIdx; j++) { + boolean isRealElbowOrKnee = (findElbows) ? + normalisedData[j][1] > threshold : normalisedData[j][1] < threshold; + if(isRealElbowOrKnee) { + localMinMaxPts.add(data[candidateIdx]); + break; + } + } + } + } + return localMinMaxPts; + } + + + +} \ No newline at end of file diff --git a/src/main/java/edu/uc/rphash/kneefinder/PyScript.py b/src/main/java/edu/uc/rphash/kneefinder/PyScript.py new file mode 100644 index 0000000..32f0d0f --- /dev/null +++ b/src/main/java/edu/uc/rphash/kneefinder/PyScript.py @@ -0,0 +1,13 @@ +#import JythonTest3 + +import sys + +class PyScript: + def __init__(self,txt): + city = [] + for i in range(0,len(sys.argv)): + city.append(str(sys.argv[i])) + print(city) +# jObj = JavaProg() +# jObj.getData(city) + print("Done") \ No newline at end of file diff --git a/src/main/java/edu/uc/rphash/kneefinder/findknee.java b/src/main/java/edu/uc/rphash/kneefinder/findknee.java new file mode 100644 index 0000000..fafa53e --- /dev/null +++ b/src/main/java/edu/uc/rphash/kneefinder/findknee.java @@ -0,0 +1,27 @@ +package edu.uc.rphash.kneefinder; + +import edu.uc.rphash.Centroid; +import edu.uc.rphash.Readers.RPHashObject; +import edu.uc.rphash.frequentItemSet.KHHCentroidCounter; +import edu.uc.rphash.lsh.LSH; +import java.util.ArrayList; + + +public class findknee implements Runnable { + + private float[] vec; + + + + @Override + public void run() { + // TODO Auto-generated method stub + + } + + +} + + + + diff --git a/src/main/java/edu/uc/rphash/projections/DBFriendlyProjection.java 
b/src/main/java/edu/uc/rphash/projections/DBFriendlyProjection.java index d88dc01..ab1c7b7 100644 --- a/src/main/java/edu/uc/rphash/projections/DBFriendlyProjection.java +++ b/src/main/java/edu/uc/rphash/projections/DBFriendlyProjection.java @@ -108,22 +108,40 @@ public float[] project(float[] v) { // -sqrt(3/t) // n: original dimension // t: target OR projected dimension + static float[] projectN(float[] v, int[][] P, int[][] M, int t) { float[] r = new float[t]; float sum; float scale = (float) Math.sqrt(3.0f / ((float) t)); for (int i = 0; i < t; i++) { sum = 0.0f; - for(int j=0;j>> 56); - s2[ct++] = (byte) (d >>> 48); - s2[ct++] = (byte) (d >>> 40); - s2[ct++] = (byte) (d >>> 32); - s2[ct++] = (byte) (d >>> 24); - s2[ct++] = (byte) (d >>> 16); - s2[ct++] = (byte) (d >>> 8); - s2[ct++] = (byte) (d); - - return computeCWowIntHash(s2, 0) % tablesize; - } - - @Override - public long hash(long[] s) { - byte[] s2 = new byte[s.length * 8]; - int ct = 0; - for (long d : s) { - s2[ct++] = (byte) (d >>> 56); - s2[ct++] = (byte) (d >>> 48); - s2[ct++] = (byte) (d >>> 40); - s2[ct++] = (byte) (d >>> 32); - s2[ct++] = (byte) (d >>> 24); - s2[ct++] = (byte) (d >>> 16); - s2[ct++] = (byte) (d >>> 8); - s2[ct++] = (byte) (d); - } - return computeCWowIntHash(s2, 0) % tablesize; - } - - public final static int CWOW_32_M = 0x57559429; - public final static int CWOW_32_N = 0x5052acdb; - public static final long LONG_LO_MASK = 0x00000000FFFFFFFFL; - - /** gather an int from the specified index into the byte array */ - public static final int gatherIntLE(byte[] data, int index) { - int i = data[index] & 0xFF; - i |= (data[++index] & 0xFF) << 8; - i |= (data[++index] & 0xFF) << 16; - i |= (data[++index] << 24); - return i; - } - - public static final int gatherPartialIntLE(byte[] data, int index, - int available) { - int i = data[index] & 0xFF; - if (available > 1) { - i |= (data[++index] & 0xFF) << 8; - if (available > 2) { - i |= (data[++index] & 0xFF) << 16; - } - } - return i; - 
} - - public int computeCWowIntHash(byte[] data, int seed) { - final int length = data.length; - /* cwfold( a, b, lo, hi ): */ - /* p = (u32)(a) * (u64)(b); lo ^=(u32)p; hi ^= (u32)(p >> 32) */ - /* cwmixa( in ): cwfold( in, m, k, h ) */ - /* cwmixb( in ): cwfold( in, n, h, k ) */ - int hVal = seed; - int k = length + seed + CWOW_32_N; - long p = 0; - int pos = 0; - int len = length; - while (len >= 8) { - int i1 = gatherIntLE(data, pos); - int i2 = gatherIntLE(data, pos + 4); - /* cwmixb(i1) = cwfold( i1, N, hVal, k ) */ - p = i1 * (long) CWOW_32_N; - k ^= p & LONG_LO_MASK; - hVal ^= (p >> 32); - /* cwmixa(i2) = cwfold( i2, M, k, hVal ) */ - p = i2 * (long) CWOW_32_M; - hVal ^= p & LONG_LO_MASK; - k ^= (p >> 32); - pos += 8; - len -= 8; - } - if (len >= 4) { - int i1 = gatherIntLE(data, pos); - /* cwmixb(i1) = cwfold( i1, N, hVal, k ) */ - p = i1 * (long) CWOW_32_N; - k ^= p & LONG_LO_MASK; - hVal ^= (p >> 32); - pos += 4; - len -= 4; - } - if (len > 0) { - int i1 = gatherPartialIntLE(data, pos, len); - /* cwmixb(i1) = cwfold( i1, N, hVal, k ) */ - p = (i1 & ((1 << (len * 8)) - 1)) * (long) CWOW_32_M; - hVal ^= p & LONG_LO_MASK; - k ^= (p >> 32); - } - p = (hVal ^ (k + CWOW_32_N)) * (long) CWOW_32_N; - k ^= p & LONG_LO_MASK; - hVal ^= (p >> 32); - hVal ^= k; - return hVal; - } - -} diff --git a/src/main/java/edu/uc/rphash/tests/ScalabilityTest.java b/src/main/java/edu/uc/rphash/tests/ScalabilityTest.java deleted file mode 100644 index b73826d..0000000 --- a/src/main/java/edu/uc/rphash/tests/ScalabilityTest.java +++ /dev/null @@ -1,130 +0,0 @@ -package edu.uc.rphash.tests; - -import java.util.ArrayList; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.ForkJoinPool; -import java.util.concurrent.Future; - -import edu.uc.rphash.RPHashAdaptive2PassParallel; -import edu.uc.rphash.RPHashSimpleParallel; -import edu.uc.rphash.RPHashStream; -import edu.uc.rphash.concurrent.VectorLevelConcurrency; -import 
edu.uc.rphash.tests.generators.GenerateStreamData; - -public class ScalabilityTest { - - public static long rphashstream(ArrayList vecsAndNoiseInThisRound, - int i, int k, GenerateStreamData gen1) { - RPHashStream rphit = new RPHashStream(k, gen1, i); - long timestart = System.nanoTime(); - //vecsAndNoiseInThisRound.parallelStream().map(vec-> - // VectorLevelConcurrency.computeSequential(vec, rphit.lshfuncs.get(0), rphit.is.get(0), rphit.getParam())); - for (float[] v : vecsAndNoiseInThisRound) - rphit.addVectorOnlineStep(v); - rphit.getCentroidsOfflineStep(); - - return System.nanoTime() - timestart; - } - - public static long rphashsimple(ArrayList vecsAndNoiseInThisRound, - int i, int k) { - RPHashSimpleParallel rphit = new RPHashSimpleParallel( - vecsAndNoiseInThisRound, k, i); - - long timestart = System.nanoTime(); - rphit.mapreduce1(); - rphit.mapreduce2(); - return System.nanoTime() - timestart; - } - - public static long rphashadaptive( - ArrayList vecsAndNoiseInThisRound, int i, int k) { - - RPHashAdaptive2PassParallel rphit = new RPHashAdaptive2PassParallel( - vecsAndNoiseInThisRound, k, i); - - long timestart = System.nanoTime(); - rphit.run(); - return System.nanoTime() - timestart; - } - - public static void scalability(int n) { - int k = 10; - int d = 1000; - float var = 1f; - Runtime rt = Runtime.getRuntime(); - // Random r = new Random(); - int NUM_Procs = rt.availableProcessors(); - - GenerateStreamData gen1 = new GenerateStreamData(k, d, var, 11331313); - - ArrayList vecsAndNoiseInThisRound = new ArrayList(n); - - // generate data in parallel - vecsAndNoiseInThisRound = gen1.genParallel(n); - - System.out.println(vecsAndNoiseInThisRound.size()); - System.out.printf("Threads\tSimple\tStream\tAdaptive\n"); - - long timesimple = 0, timeadaptive = 0, timestream = 0; - - for (int i = 1; i <= NUM_Procs; i++) { - - try { - //mix up the order - if(i%3==0){ - System.gc(); - Thread.sleep(1000); - timesimple = rphashsimple(vecsAndNoiseInThisRound, i, k); - 
System.gc(); - Thread.sleep(1000); - timeadaptive = rphashadaptive(vecsAndNoiseInThisRound, i, k); - System.gc(); - Thread.sleep(1000); - timestream = rphashstream(vecsAndNoiseInThisRound, i, k, gen1); - } - - if(i%3==1){ - System.gc(); - Thread.sleep(1000); - timeadaptive = rphashadaptive(vecsAndNoiseInThisRound, i, k); - System.gc(); - Thread.sleep(1000); - timesimple = rphashsimple(vecsAndNoiseInThisRound, i, k); - System.gc(); - Thread.sleep(1000); - timestream = rphashstream(vecsAndNoiseInThisRound, i, k, gen1); - } - - if(i%3==2){ - System.gc(); - Thread.sleep(1000); - timesimple = rphashsimple(vecsAndNoiseInThisRound, i, k); - System.gc(); - Thread.sleep(1000); - timestream = rphashstream(vecsAndNoiseInThisRound, i, k, gen1); - System.gc(); - Thread.sleep(1000); - timeadaptive = rphashadaptive(vecsAndNoiseInThisRound, i, k); - } - - System.out.printf("%d\t%.6f\t%.6f\t%.6f\n", i, - timesimple / 1e9f, timestream / 1e9f, - timeadaptive / 1e9f); - - } catch (Exception e) { - e.printStackTrace(); - System.out.println("Exception at Proc:" + String.valueOf(i)); - System.out.printf("%d\t%.6f\t%.6f\t%.6f\n", i, - timesimple / 1e9f, timestream / 1e9f, - timeadaptive / 1e9f); - - } - } - } - - public static void main(String[] args) throws InterruptedException { - ScalabilityTest.scalability(Integer.parseInt(args[0])); - - } -} diff --git a/src/main/java/edu/uc/rphash/tests/StatTests.java b/src/main/java/edu/uc/rphash/tests/StatTests.java index 6b7a927..e63a336 100644 --- a/src/main/java/edu/uc/rphash/tests/StatTests.java +++ b/src/main/java/edu/uc/rphash/tests/StatTests.java @@ -110,7 +110,8 @@ public static double WCSSECentroidsFloat(List estCentroids, List M = ( new LloydIterativeKmeans(k,gen.data(),projdim)).getCentroids(); + long duration = (System.nanoTime() - startTime); // List aligned = VectorUtil.alignCentroids(M,gen.medoids()); diff --git a/src/main/java/edu/uc/rphash/tests/clusterers/AdaptiveMeanShift.java 
b/src/main/java/edu/uc/rphash/tests/clusterers/AdaptiveMeanShift.java deleted file mode 100644 index 122dac3..0000000 --- a/src/main/java/edu/uc/rphash/tests/clusterers/AdaptiveMeanShift.java +++ /dev/null @@ -1,432 +0,0 @@ -package edu.uc.rphash.tests.clusterers; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashSet; -import java.util.List; -import java.util.Set; - -import edu.uc.rphash.Centroid; -import edu.uc.rphash.Clusterer; -import edu.uc.rphash.Readers.RPHashObject; -import edu.uc.rphash.Readers.SimpleArrayReader; -import edu.uc.rphash.kdtree.KDTreeNN; -import edu.uc.rphash.kdtree.naiveNN; -import edu.uc.rphash.lsh.LSHkNN; -import edu.uc.rphash.tests.generators.GenerateData; -import edu.uc.rphash.util.VectorUtil; - -/* Adaptive Mean Shift (AMS) Algorithm - * - * - * Mean Shift algorithm based on methods described by Fukunaga and Hostetler - * 'Estimation of the Gradient of a Density Function, with Applications - * in Pattern Recognition' ( - * - * http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=1055330 - * - * - * - * Additional Kernel and Optimizations described by Cheng - * 'Mean Shift, Mode Seeking, and Clustering' (1995) - * - * http://dl.acm.org/citation.cfm?id=628711 - * - * - * - * Adaptive Mean Shift algorithm based on ... 
- * - * - */ - - -//TODO: Add labels to points for centroids -//TODO: add weights to centroid merging -> rphash (cardinality) -//TODO: windowMode -> Sample Point Estimator - -final class cStore{ - public int count; - public float[] centroid; - public float wcsse = 0; - public Centroid cent; - - public void addPoint(float[] point){ - this.count++; - this.wcsse += VectorUtil.distance(point, centroid); - this.cent.setCount(this.count); - //TODO: this.cent.setWCSS(this.wcsse); - } - - public cStore(float[] centroid){ - this.count = 0; - this.cent = new Centroid(centroid,0); - this.centroid = centroid; - this.wcsse = 0; - } - - public cStore(float[] window, float[] point) { - // TODO Auto-generated constructor stub - this.count = 0; - this.cent = new Centroid(window,0); - this.centroid = window; - this.wcsse = VectorUtil.distance(point, window); - } - -} - - -public class AdaptiveMeanShift implements Clusterer { - - List data; //global data storage - List centroids; //global centroid storage - private RPHashObject so; - private List cs; - - //Parameters - double h = 1; // bandwidth - - int kernelMode = 0; // mode (0:uniform; 1:gaussian) - - int windowMode = 1; // Determine how to perform the Adaptive Window - // 0 - No adaptivity; Basic Mean Shift - // 1 - Balloon Estimator - // 2 - Sample Point Estimator (TODO) - - int knnAlg = 2; //Determine what KNN algorithm to use - // 0 - kNN Naive - // 1 - kNN LSH - // 2 - KD-TREE kNN - - int k = 5; //Number of KNN points for adaptive window - - Clusterer weightClusters = null; - - - static int maxiters = 10000; //Max iterations before breaking search for convergence - float convergeValue = (float) 0.00001; //maximum change in each dimension to 'converge' - float blurPercent = (float) 2; //Amount to blur centroids to group similar Floats - - //TEST Parameters: - boolean debug = false; //Control Debug Output - boolean minimalOutput = true; //Print the minimal final output (pretty print) - boolean printCentroids = true; //Print out 
centroids (not pretty) - Set cent = new HashSet(); //Storage for grouping the clusters - - public void setMode(int mode){ this.kernelMode = mode; } - - public void setH(double h) { this.h = h; } - - public void setWinMode(int winMode){ this.windowMode = winMode; } - - public List getData() { return data; } - - public void setRawData(List data){ this.data = data; } - - - public AdaptiveMeanShift(){ - this.centroids = new ArrayList(); - this.cs = new ArrayList(); - } - - public AdaptiveMeanShift(int k, List data){ - this.k = k; - this.data = data; - this.centroids = new ArrayList(); - } - - public AdaptiveMeanShift(int h, int windowMode, int kernelMode, int k, List data){ - this.h = h; - this.windowMode = windowMode; - this.kernelMode = kernelMode; - this.k = k; - this.data = data; - this.centroids = new ArrayList(); - this.cs = new ArrayList(); - } - - public AdaptiveMeanShift(int h, int windowMode, int kernelMode, int k, List data, Clusterer c){ - this.h = h; - this.weightClusters = c; - this.windowMode = windowMode; - this.kernelMode = kernelMode; - this.k = k; - this.data = data; - this.centroids = new ArrayList(); - this.cs = new ArrayList(); - } - - public float calcMode(float curWindow, float workingData){ - float mPoint = 0; - float kern = 0; - - if (kernelMode == 0) //Uniform - mPoint = workingData; - else if (kernelMode == 1){ //Gaussian - float c = (float) (1.0/Math.pow(h,2)); - kern = (float) Math.exp(-c * Math.pow(workingData - curWindow, 2)); - mPoint = (float) kern * (workingData - curWindow); - } - - return mPoint; - } - - public void adaptH(List data, int curPoint, LSHkNN knnHandle, KDTreeNN kdHandle, naiveNN naiveHandle){ - if(windowMode == 0) //No adaptivity - return; - else if(windowMode == 1){ //Balloon - if(knnAlg == 0){ - h = Math.sqrt(naiveHandle.getNNEuc(k, data.get(curPoint))); - printDebug("naiveH: " + h); - } - if(knnAlg == 1){ - List retData = knnHandle.knn(k, data.get(curPoint)); - h = VectorUtil.distance(retData.get(retData.size() - 
1),data.get(curPoint)); - printDebug("LSHH: " + h); - } - if(knnAlg == 2){ - h = Math.sqrt(kdHandle.treeNNEuc(k, data.get(curPoint))); - printDebug("KDH: " + h + "\n"); - } - - return; - } - else if(windowMode == 2){ //KNN sample point estimator - return; - } - } - - - - public void cluster(List data){ - LSHkNN knnHandle = null; - KDTreeNN kdHandle = null; - naiveNN naiveHandle = null; - if(windowMode == 1){ - if(knnAlg == 0){ - naiveHandle = new naiveNN(data); - } - if(knnAlg == 1){ - knnHandle = new LSHkNN(data.get(0).length,5); - knnHandle.createDB(data); - } - if(knnAlg == 2){ - kdHandle = new KDTreeNN(); - kdHandle.createTree(data); - } - } - - - for(int i = 0; i < data.size(); i++){ - - float[] curWindow = new float[data.get(0).length]; - float[] bufWindow = new float[data.get(0).length]; - boolean converge = false; - int m = 0; - int winCount = 0; - - for(int t = 0; t < data.get(0).length; t++){ - curWindow = data.get(i).clone(); - } - - adaptH(data, i, knnHandle, kdHandle, naiveHandle); - - while((!converge) && (m < maxiters)){ - m++; - bufWindow = curWindow.clone(); - - for(int t = 0; t < data.get(0).length; t++){ - curWindow[t] = (float) 0; - } - - for(int x = 0; x < data.size(); x++){ - - if(VectorUtil.distance(bufWindow, data.get(x)) <= h){ - winCount++; - - for(int n = 0; n < data.get(x).length; n++){ - curWindow[n] = curWindow[n] + calcMode(bufWindow[n], data.get(x)[n]); - } - } - } - - if(winCount > 0){ - boolean convergeTest = true; - - for(int y = 0; y < curWindow.length; y++){ - if(curWindow[y] >= convergeValue) - convergeTest = false; - } - - if(kernelMode == 0){ - for(int y = 0; y < curWindow.length; y++){ - curWindow[y] = curWindow[y] / winCount; - } - } - if(kernelMode >= 1){ - for(int y = 0; y < curWindow.length; y++){ - curWindow[y] = curWindow[y] / winCount; - curWindow[y] = bufWindow[y] + curWindow[y]; - printDebug("New Window: " + curWindow[y]); - } - printDebug("_______________________________________"); - } - - - //Check for convergence 
- if(Arrays.equals(curWindow,bufWindow) || convergeTest){ - boolean add = true; - if(centroids.indexOf(curWindow) >= 0){ - add = false; - } - add = checkAllCentroids(curWindow, data.get(i)); - - if(add){ - String str = ""; - for(int j = 0; j < curWindow.length; j++){str += Float.toString(curWindow[j]) + ",";} - cent.add(str + "\n"); - } - - converge = true; - } - bufWindow = curWindow.clone(); - } - - m = 0; - winCount = 0; - } - } - - for(cStore cen: cs){ - Centroid it = new Centroid(cen.centroid, 0); - it.setCount(cen.count); - //TODO: it.setWCSS(cen.wcsse); - centroids.add(it); - - } - } - - - public boolean checkAllCentroids(float[] window, float[] point){ - float[] centroid; - for(cStore cz : cs){ - centroid = cz.centroid; - double percentDiff = 0; - - for(int z = 0; z < centroid.length; z++){ - percentDiff = percentDiff + Math.abs(1-(centroid[z] / window[z])); - } - - percentDiff = percentDiff / centroid.length; - - if(percentDiff < blurPercent){ - cz.addPoint(point); - return false; - } - - } - - cs.add(new cStore(window, point)); - return true; - } - - - void run(){ - if(this.weightClusters != null){ - //this.weightClusters.setData(this.data); - } - - cluster(this.data); - } - - - public void printDebug(String s){ - if(debug) - System.out.println(s); - } - - - public static void main(String[] args){ - int genClusters = 3; - int genRowsPerCluster =100; - int genColumns = 100; - - AdaptiveMeanShift ams = new AdaptiveMeanShift(); - - if(ams.data == null){ - GenerateData gen = new GenerateData(genClusters,genRowsPerCluster, genColumns); - ams.data = gen.data; - } - - ams.run(); - if(ams.printCentroids){ - System.out.println("Centroid Count: " + ams.centroids.size()); - for(Centroid c: ams.centroids){ - System.out.println("WCSS = " + c.getWCSS()); - System.out.print("Cent = "); - for(int z = 0; z < c.centroid().length; z++) - System.out.print(c.centroid()[z] + ","); - System.out.println("\n\n"); - } - } - if(ams.minimalOutput){ - System.out.println("\n\nh: " + 
ams.h); - System.out.println("Kernel Mode: " + ams.kernelMode); - System.out.println("Window Mode: " + ams.windowMode); - System.out.println("k (KNN): " + ams.k + "\n"); - System.out.println("Number of Clusters: " + ams.cent.size() + "\n"); - System.out.println(ams.cent.toString().replaceAll(", ", " ")); - } - - System.out.println("\n\nDone!"); - } - - - @Override - public List getCentroids() { - if(this.centroids.size() == 0) - run(); - return this.centroids; - } - - @Override - public RPHashObject getParam() { - so = new SimpleArrayReader(this.data, k); - return so; - } - - @Override - public void setK(int getk) { - this.k = getk; - } - - @Override - public void setWeights(List counts) { - // TODO Auto-generated method stub - if(data != null) { - - } - - - } - - @Override - public void setData(List centroids) { - ArrayList data = new ArrayList(centroids.size()); - for(Centroid c : centroids)data.add(c.centroid()); - setRawData(data); - } - - @Override - public void reset(int randomseed) { - // TODO Auto-generated method stub - this.centroids = null; - - } - - @Override - public boolean setMultiRun(int runs) { - // Return true to ignore multi-run (deterministic) - return true; - } - -} \ No newline at end of file diff --git a/src/main/java/edu/uc/rphash/tests/clusterers/Agglomerative.java b/src/main/java/edu/uc/rphash/tests/clusterers/Agglomerative.java deleted file mode 100644 index bedf99d..0000000 --- a/src/main/java/edu/uc/rphash/tests/clusterers/Agglomerative.java +++ /dev/null @@ -1,151 +0,0 @@ -package edu.uc.rphash.tests.clusterers; - -import java.util.ArrayList; -import java.util.List; - -import edu.uc.rphash.Centroid; -import edu.uc.rphash.Clusterer; -import edu.uc.rphash.Readers.RPHashObject; -import edu.uc.rphash.tests.generators.GenerateData; -import edu.uc.rphash.util.VectorUtil; - -public class Agglomerative implements Clusterer{ - - int k; - List clusters; - List data; - float[][] distances; - List counts; - public Agglomerative() - { - - } - 
public Agglomerative(int k, List data) - { - this.k = k; - this.data = data; - this.clusters = null; - counts = new ArrayList(); - for(int i = 0;i data){ - float[][] distances = new float[data.size()][data.size()]; - - for(int i = 0 ; i < data.size();i++) - { - for(int j = 0; j < data.size();j++) - distances[i][j] =VectorUtil.distance(data.get(i), data.get(j)); - } - return distances; - } - - - private float[] avgVector(float[] u, float[] v, Float float1, Float float2){ - float[] w = new float[u.length]; - for(int i = 0 ;i < u.length;i++)w[i] = (u[i]*float1+v[i]*float2)/(float1+float2); - return w; - } - - private void merge() - { - float minimum = 1000000f; - int mini = 0; - int minj = 0; - int i = 0 ; - for(float[] l : distances) - { - for(int j = 0; j < data.size();j++){ - if(l[j]k) - merge(); - } - - public static void main(String[] args){ - GenerateData gen = new GenerateData(3,500,2); - List data =gen.data; - float[][] dists = distanceArray(data); -// double[] weights = new double[data.size()]; - - - String[] s = new String[dists.length]; - for(int i = 0;i< dists.length;i++)s[i] = String.valueOf(i); - - Agglomerative agl = new Agglomerative(3, data); - agl.run(); - for(float[] cent: gen.getMedoids()){ - for(float f : cent)System.out.print(f+" "); - System.out.println(); - } - System.out.println("computed"); - - for(Centroid cent: agl.getCentroids()){ - for(float f : cent.centroid())System.out.print(f+" "); - System.out.println(); - } - - } - - @Override - public List getCentroids() { - if(clusters==null)run(); - List cents = new ArrayList<>(clusters.size()); - for(float[] v : this.clusters)cents.add(new Centroid(v,0)); - return cents; - } - - @Override - public void reset(int randomseed) { - clusters = null; - } - - - @Override - public RPHashObject getParam() { - // TODO Auto-generated method stub - return null; - } - - @Override - public void setWeights(List counts) { - //this.counts = counts; - counts = new ArrayList(); - } - @Override - public void 
setK(int getk) { - this.k = getk; - } - @Override - public void setData(List centroids) { - this.data = new ArrayList(centroids.size()); - for(Centroid c : centroids) data.add(c.centroid()); - } - @Override - public void setRawData(List centroids) { - this.data = centroids; - } - - @Override - public boolean setMultiRun(int runs) { - //agglomerative is deterministic running multiple times is moot - return true; - } -} diff --git a/src/main/java/edu/uc/rphash/tests/clusterers/Agglomerative2.java b/src/main/java/edu/uc/rphash/tests/clusterers/Agglomerative2.java deleted file mode 100644 index ad5cd4a..0000000 --- a/src/main/java/edu/uc/rphash/tests/clusterers/Agglomerative2.java +++ /dev/null @@ -1,372 +0,0 @@ -package edu.uc.rphash.tests.clusterers; - -import java.util.ArrayList; -import java.util.Iterator; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.TreeSet; - -import edu.uc.rphash.Centroid; -import edu.uc.rphash.Clusterer; -import edu.uc.rphash.Readers.RPHashObject; -import edu.uc.rphash.tests.StatTests; -import edu.uc.rphash.tests.generators.GenerateData; -import edu.uc.rphash.util.VectorUtil; - -public class Agglomerative2 implements Clusterer { - private class DistAndVector implements Comparable { - Float dist; - Integer vec; - - @Override - public int compareTo(DistAndVector o) { - // if (equals(o)) - // return 0; - if (dist.floatValue() == o.dist.floatValue()) - return 1; - if (dist.floatValue() < o.dist.floatValue()) - return -1; - return 1; - - } - } - - private class PQAndVector implements Comparable { - TreeSet pq; - Integer vec; - - PQAndVector(Integer vec) { - this.pq = new TreeSet(); - this.vec = vec; - } - - // @Override - // public boolean equals(Object o) { - // - // return ((PQAndVector) o).vec.intValue() == vec.intValue(); - // } - - @Override - public int compareTo(PQAndVector r) { - - // if (equals(r)){ - // System.out.println("whatthehell"+ r.vec.intValue() + ":"+ - // this.vec.intValue()); - // return 0; - // - 
// } - if (pq.isEmpty()) { - return 1; - } - if (r.pq.isEmpty()) { - return -1; - } - - if (pq.first().dist == r.pq.first().dist) { - return 1; - } - return pq.first().compareTo(r.pq.first()); - } - } - - int k; - TreeSet outerpq = new TreeSet(); - List data; - float counts[]; - - private void distanceArray(List data) { - int n = data.size(); - for (int i = 0; i < n - 1; i++) { - PQAndVector innerpq = new PQAndVector(new Integer(i)); - - for (int j = i + 1; j < n; j++) { - DistAndVector dv = new DistAndVector(); - dv.dist = new Float(VectorUtil.distance(data.get(i), - data.get(j))); - dv.vec = new Integer(j); - innerpq.pq.add(dv); - } - // - // System.out.print(i+" : "); - // for(Object p: - // innerpq.pq.toArray())System.out.print(((DistAndVector)p).vec+", ");System.out.println(); - outerpq.add(innerpq); - } - // for(PQAndVector p: - // outerpq)System.out.print(p.vec+", ");System.out.println(); - - } - - private void distanceArray2(List data2,List projIDs) { - int n = data.size(); - for (int i = 0; i < n - 1; i++) { - PQAndVector innerpq = new PQAndVector(new Integer(i)); - - for (int j = i + 1; j < n; j++) { - DistAndVector dv = new DistAndVector(); - if(projIDs.get(i).equals(projIDs.get(j))){ - dv.dist = Float.MAX_VALUE; - } - else{ - dv.dist = new Float(VectorUtil.distance(data.get(i), - data.get(j))); - } - - - dv.vec = new Integer(j); - innerpq.pq.add(dv); - } - outerpq.add(innerpq); - } - } - - private void mergeAndUpdateCentroids(int newdata, int olddata) - { - float[] u = data.get(newdata); - float[] v = data.get(olddata); - float ct1 = counts[newdata]; - float ct2 = counts[olddata]; - float[] w = new float[u.length]; - for (int i = 0; i < u.length; i++) - w[i] = (u[i] * ct1 + v[i] * ct2) / (ct1 + ct2); - counts[newdata] += counts[olddata]; - data.set(newdata, w); - - } - - /** - * remove the next two nearest vectors and perform a counts weighted average - * of the vectors. put this vector in the lower of the two vector indeces. 
- */ - private void merge() { - // pop the queue with the nearest top vector in it - - PQAndVector innerpq = outerpq.pollFirst(); - //lower id lists are not checked for removals, check here. - while (innerpq.pq.isEmpty()) { - innerpq = outerpq.pollFirst(); - } - // pop the nearest vector - DistAndVector dv = innerpq.pq.pollFirst(); - - int newvecloc = innerpq.vec; - int olddata = dv.vec; - - Iterator it = outerpq.iterator(); - while (it.hasNext()) { - PQAndVector v = it.next(); - if (v.vec.intValue() == olddata) { - - it.remove(); - break; - } - } - - // merge the two vectors - mergeAndUpdateCentroids(newvecloc, olddata); - - PQAndVector newpq = new PQAndVector(newvecloc); - - Iterator pqit = outerpq.iterator(); - while (pqit.hasNext()) { - PQAndVector itpq = pqit.next(); - - // remove the merged vectors from all inner lists - Iterator itdv = itpq.pq.iterator(); - while (itdv.hasNext()) { - DistAndVector v = itdv.next(); - if (v.vec.intValue() == newvecloc) { - itdv.remove(); - break; - } - } - - itdv = itpq.pq.iterator(); - while (itdv.hasNext()) { - DistAndVector v = itdv.next(); - if ( v.vec.intValue() == olddata) { - itdv.remove(); - break; - } - } - } - - // System.out.println("lists after removal"); - // printlists(); - - pqit = outerpq.iterator(); - while (pqit.hasNext()) { - PQAndVector itpq = pqit.next(); - // add the new vector distance to all upper parent lists - // compute new distance to vecs who have this vector in their list - if (itpq.vec < newvecloc) { - DistAndVector dv3 = new DistAndVector(); - dv3.dist = new Float(VectorUtil.distance(data.get(newvecloc), - data.get(itpq.vec))); - - dv3.vec = new Integer(newvecloc); - - // add the updated vector to the new lists - itpq.pq.add(dv3); - } - } - - // System.out.println("lists after adding back into lower idx lists"); - // printlists(); - - pqit = outerpq.iterator(); - while (pqit.hasNext()) { - PQAndVector itpq = pqit.next(); - // add to the new merge list - if (itpq.vec > newvecloc) { - 
DistAndVector dv2 = new DistAndVector(); - dv2.dist = new Float(VectorUtil.distance(data.get(newvecloc), - data.get(itpq.vec))); - dv2.vec = new Integer(itpq.vec); - newpq.pq.add(dv2); - } - } - - outerpq.add(newpq); - - // System.out.println("adding merged list back"); - // printlists(); - - } - - private void printlists() { - System.out.println(); - for (Object o : outerpq.toArray()) { - System.out.print("\t" + ((PQAndVector) o).vec + " : "); - for (Object p : ((PQAndVector) o).pq.toArray()) { - System.out.print(((DistAndVector) p).vec + ", "); - } - System.out.println(); - } - } - - private void run() { - while (outerpq.size() > k) { - - merge(); - } - Iterator pqit = outerpq.iterator(); - centroids = new ArrayList(); - while (pqit.hasNext()) { - PQAndVector innerpq = pqit.next(); - centroids.add(data.get(innerpq.vec)); - } - } - - public static void main(String[] args) { - - for (int i = 0; i < 1000; i += 10) { - long avgtime = 0; - float avgdistagg = 0; - float avgdistreal = 0; - float avgdistkm = 0; - if(i!=0){ - for (int j = 0; j < 5; j++) { - GenerateData gen = new GenerateData(10, i, 10, .5f); - List data = gen.data; - - long timestart = System.currentTimeMillis(); - Clusterer km1 = new LloydIterativeKmeans(10, data); - Clusterer ag1 = new Agglomerative2(10, data); - avgdistagg+=StatTests.WCSSECentroidsFloat(ag1.getCentroids(), data); - avgdistkm+=StatTests.WCSSECentroidsFloat(km1.getCentroids(), data); - avgdistreal+=StatTests.WCSSE(gen.getMedoids(), data); - avgtime += (System.currentTimeMillis() - timestart); - } - } - System.out.println(i + "\t" + avgtime / 5+"\t"+avgdistagg/5f+"\t"+avgdistkm/5f+"\t"+avgdistreal/5f); - } - - } - - List centroids; - - @Override - public List getCentroids() { - if(centroids==null)run(); - List cents = new ArrayList<>(centroids.size()); - for(float[] v : this.centroids)cents.add(new Centroid(v,0)); - return cents; - } - - @Override - public void reset(int randomseed) { - centroids = null; - } - - @Override - public 
RPHashObject getParam() { - return null; - } - - public void printDistanceArray() { - for (int i = 0; i < data.size(); i++) { - for (int j = 0; j < data.size(); j++) - System.out.printf("%.2f,", - VectorUtil.distance(data.get(i), data.get(j))); - System.out.println(); - } - System.out.println(); - } - - public Agglomerative2(int k, List data) { - this.k = k; - this.data = data; - this.counts = new float[data.size()]; - for (int i = 0; i < counts.length; i++) - counts[i] = 1; - - distanceArray(data); - - } - - public Agglomerative2(int k, List data, List counts) { - this.k = k; - this.data = data; - this.counts = new float[counts.size()]; - for (int i = 0; i < counts.size(); i++) - this.counts[i] = counts.get(i); - distanceArray(data); - } - - public Agglomerative2(int k, List data, List counts,List projectionIDs) { - this.k = k; - this.data = data; - this.counts = new float[counts.size()]; - for (int i = 0; i < counts.size(); i++) - this.counts[i] = counts.get(i); - distanceArray2(data,projectionIDs); - } - - @Override - public void setWeights(List counts) { - - } - - @Override - public void setData(List centroids) { - this.data = new ArrayList(centroids.size()); - for(Centroid c : centroids) data.add(c.centroid()); - } - @Override - public void setRawData(List centroids) { - this.data = centroids; - } - - @Override - public void setK(int getk) { - this.k = getk; - } - - @Override - public boolean setMultiRun(int runs) { - //agglomerative is deterministic running multiple times is moot - return true; - } - -} diff --git a/src/main/java/edu/uc/rphash/tests/clusterers/DBScan.java b/src/main/java/edu/uc/rphash/tests/clusterers/DBScan.java index 085bb30..e2617da 100644 --- a/src/main/java/edu/uc/rphash/tests/clusterers/DBScan.java +++ b/src/main/java/edu/uc/rphash/tests/clusterers/DBScan.java @@ -35,16 +35,15 @@ public DBScan() { } - /* public DBScan(List , double eps , int minPoints) { + public DBScan(List data , double eps , int minPoints) { this.setRawData(data); - 
this.setEps(eps); - this.setminpoints(minPoints); - + this.eps = eps; + this.minPoints = minPoints; } - */ + public DBScan(List data ) { @@ -55,8 +54,8 @@ public DBScan(List data ) { public List getCentroids() { // to be completed - double eps = 0.35; - int minPoints = 5; + //double eps = 6; + //int minPoints = 4; DBSCANClusterer db = new DBSCANClusterer(eps , minPoints ); @@ -85,7 +84,10 @@ public List getCentroids() { // to be completed } C.add(new Centroid(floatArray, 0)); // setting the projection id = 0 } + return C; + + } // abstract RPHashObject getParam(); @@ -162,13 +164,20 @@ public boolean setMultiRun(int runs) { public static void main(String[] args) { - GenerateData gen = new GenerateData(3, 1000, 5); // the data generator of rhpash + GenerateData gen = new GenerateData(20,500,5); // the data generator of rhpash + - DBScan db = new DBScan (gen.data ); + DBScan db = new DBScan (gen.data, 1 , 2 ); + + System.out.println("minpoints = "+ (db.minPoints)); + System.out.println("eps = "+ (db.eps)); + System.out.println("number of centroids = "+ (db.getCentroids()).size()); for (Centroid iter : db.getCentroids()) { // output centroids float[] toprint = iter.centroid(); + System.out.println("333333333333333"); + System.out.println(Arrays.toString(toprint)); } diff --git a/src/main/java/edu/uc/rphash/tests/clusterers/KMeans2.java b/src/main/java/edu/uc/rphash/tests/clusterers/KMeans2.java index e0a899b..38651f7 100644 --- a/src/main/java/edu/uc/rphash/tests/clusterers/KMeans2.java +++ b/src/main/java/edu/uc/rphash/tests/clusterers/KMeans2.java @@ -1,7 +1,7 @@ package edu.uc.rphash.tests.clusterers; import java.util.ArrayList; -import java.util.Arrays; + import java.util.List; import java.util.Random; @@ -10,7 +10,7 @@ import edu.uc.rphash.Readers.RPHashObject; import edu.uc.rphash.tests.StatTests; import edu.uc.rphash.tests.generators.GenerateData; -import edu.uc.rphash.util.VectorUtil; + public class KMeans2 implements Clusterer { diff --git 
a/src/main/java/edu/uc/rphash/tests/clusterers/Kmeans.java b/src/main/java/edu/uc/rphash/tests/clusterers/Kmeans.java deleted file mode 100644 index 2f92ac1..0000000 --- a/src/main/java/edu/uc/rphash/tests/clusterers/Kmeans.java +++ /dev/null @@ -1,330 +0,0 @@ -package edu.uc.rphash.tests.clusterers; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.LinkedList; -import java.util.List; -import java.util.Random; - -import edu.uc.rphash.Centroid; -import edu.uc.rphash.Clusterer; -import edu.uc.rphash.Readers.RPHashObject; -import edu.uc.rphash.Readers.SimpleArrayReader; -import edu.uc.rphash.projections.DBFriendlyProjection; -import edu.uc.rphash.projections.Projector; -import edu.uc.rphash.tests.generators.GenerateData; -import edu.uc.rphash.util.VectorUtil; - -import org.apache.commons.lang3.ArrayUtils; -//import org.rosuda.JRI.REXP; -//import org.rosuda.JRI.Rengine; - -public class Kmeans implements Clusterer { - int k; - int n; - List data; - public int getK() { - return k; - } - - public void setK(int k) { - this.k = k; - } - - public List getData() { - return data; - } - - @Override - public void setData(List centroids) { - this.data = new ArrayList(centroids.size()); - for(Centroid c : centroids) data.add(c.centroid()); - } - @Override - public void setRawData(List centroids) { - this.data = centroids; - } - - public List getWeights() { - return weights; - } - - public void setWeights(List weights) { - this.weights = weights; - } - - int projdim; - -// List means; -// List kmeansCentroids = new ArrayList(); - List> clusters; - List weights; -// Rengine re; -// -// public void setRengine(Rengine re) { -// this.re = re; -// } - - public Kmeans(int k, List data) { - this.k = k; - this.data = data; - this.projdim = 0; - this.clusters = null; - this.weights = new ArrayList(data.size()); - for (int i = 0; i < data.size(); i++) - weights.add(1f); -// means = null; - } - - public Kmeans(int k, List data, List weights) { - this.k = k; - 
this.data = data; - this.projdim = 0; - this.clusters = null; - this.weights = weights; -// means = null; - } - - public Kmeans(int k, List data, int projdim) { - this.k = k; - this.data = data; - this.projdim = projdim; - this.clusters = null; - this.weights = new ArrayList(data.size()); - for (int i = 0; i < data.size(); i++) - weights.add(1f); -// means = null; - } - -// public Kmeans(Rengine re) { -// // TODO Auto-generated constructor stub -// this.re = re; -// } - - /* - public float[] computerCentroid(List vectors, List data) { - int d = data.get(0).length; - float[] centroid = new float[d]; - - for (int i = 0; i < d; i++) - centroid[i] = 0.0f; - - float w_total = 0f; - for (Integer v : vectors) { - w_total += weights.get(v); - } - - for (Integer v : vectors) { - float[] vec = data.get(v); - float weight = (float) weights.get(v) / (float) w_total; - for (int i = 0; i < d; i++) - centroid[i] += (vec[i] * weight); - } - return centroid; - } - - - ArrayList weightTotals; - - void updateMeans(List data) { - weightTotals = new ArrayList(); - if (means == null) { - means = new ArrayList(); - for (int i = 0; i < k; i++) - means.add(computerCentroid(clusters.get(i), data)); - } - for (int i = 0; i < k; i++) - means.set(i, computerCentroid(clusters.get(i), data)); - } - - int assignClusters(List data) { - int swaps = 0; - List> newClusters = new ArrayList>(); - for (int j = 0; j < k; j++) - newClusters.add(new ArrayList()); - - for (int clusterid = 0; clusterid < k; clusterid++) { - - for (Integer member : clusters.get(clusterid)) { - - int nearest = VectorUtil.findNearestDistance(data.get(member), - means); - newClusters.get(nearest).add(member); - if (nearest != clusterid) - swaps++; - } - - } - clusters = newClusters; - return swaps; - } - - - private void run() { - int maxiters = 1000; - int swaps = 2; - this.n = this.data.size(); - ArrayList workingdata = new ArrayList(); - // stuff for projected kmeans - Projector p = null; - Random r = new Random(); - if 
(projdim != 0) - p = new DBFriendlyProjection(this.data.get(0).length, projdim, - r.nextInt()); - for (float[] v : this.data) { - if (p != null) { - workingdata.add(p.project(v)); - } else - workingdata.add(v); - } - - int maxout = 0; - //loop until there are no more nullsets - boolean nullset = false; - do { - this.clusters = new ArrayList>(k); - // seed data with new clusters - ArrayList shufflelist = new ArrayList(data.size()); - for (int i = 0; i < data.size(); i++) - shufflelist.add(i); - - for (int i = 0; i < k; i++) { - List tmp = new LinkedList(); - tmp.add(shufflelist.remove(0)); - - for (int j = 1; j < workingdata.size() / k ; j++) { - int nxt = r.nextInt(shufflelist.size()); - tmp.add(shufflelist.remove(nxt)); - } - this.clusters.add(tmp); - } - - - cluster(maxiters, swaps, n, workingdata, clusters); - - nullset = false; - - for (List cluster : clusters) { - nullset |= (cluster.size() == 0); - } - - } while (nullset && ++maxout<100); - if (maxout == 100) - System.err.println("Warning: MaxIterations Reached Outer"); - - } - - public void cluster(int maxiters, int swaps, int n, - ArrayList workingdata, List> clusters) { - while (swaps > 0 && maxiters > 0) { - maxiters--; - updateMeans(workingdata); - swaps = assignClusters(workingdata); - } - if (maxiters == 0) - System.err.println("Warning: MaxIterations Reached"); - updateMeans(this.data); - } - */ - - public Kmeans() { - // TODO Auto-generated constructor stub - } - - @Override - public List getCentroids() { - // if (means == null) { - // run(); - -// Rengine re = Rengine.getMainEngine(); -// if(re == null) -// re = new Rengine(new String[] {"--no-save"}, false, null); - -// if (!re.waitForR()) -// System.out.println("Cannot load R"); - - ArrayList workingdata = new ArrayList(); - for (float[] v : this.data) - workingdata.add(v); - List kmeansCentroids = new ArrayList(); - - // Convert List data to a 2D array - float[][] matrix = new float[workingdata.size()][]; - matrix = workingdata.toArray(matrix); - 
- // Get the number of rows and columns of the 2D array - int rows = matrix.length; - String numRows = String.valueOf(rows); - - int cols = matrix[0].length; - String numCols = String.valueOf(cols); - - // Set k - String kAsString = String.valueOf(k); - - // Convert the 2D array to a 1D double array to feed into R - double[] oneDArray = flatten(matrix); - -// // Feed the 1D array, k and number of rows and columns to R -// re.assign("data", oneDArray); -// re.assign("numberOfRows", numRows); -// re.assign("numberOfCols", numCols); -// re.assign("k", kAsString); -// -// // Create the data matrix in R -// re.eval("dataMatrix <- matrix(data, nrow = as.numeric(numberOfRows), ncol = as.numeric(numberOfCols), byrow = TRUE)"); -// -// // Run k-means in R -// double[][] kmOut = re.eval("kmeans(dataMatrix, as.numeric(k), nstart = 25)$centers").asDoubleMatrix(); - - // Convert the 2D array back to List format -// for (int i = 0; i < kmOut.length; i++) { -// float[] vector = new float[kmOut[0].length]; -// for (int j = 0; j < kmOut[0].length; j++) -// vector[j] = (float) kmOut[i][j]; -// kmeansCentroids.add(vector); -// } -// re.end(); - // } - List l = new ArrayList<>(); - for(float[] f : kmeansCentroids) - l.add(new Centroid(f,0)); - return l; - } - - // Convert a 2D array to a 1D double array - public static double[] flatten(float[][] twoDArray) { - ArrayList oneDArray = new ArrayList(); - - for (int i = 0; i < twoDArray.length; i++) - for (int j = 0; j < twoDArray[i].length; j++) - oneDArray.add((double) twoDArray[i][j]); - - Double[] doubles = oneDArray.toArray(new Double[0]); - double[] d = ArrayUtils.toPrimitive(doubles); - - return d; - } - - @Override - public void reset(int randomseed) { - - } - - public static void main(String[] args) { - GenerateData gen = new GenerateData(8, 100, 100); - Kmeans kk = new Kmeans(5, gen.data(), 24); -// VectorUtil.prettyPrint(kk.getCentroids()); - } - - @Override - public RPHashObject getParam() { - return new 
SimpleArrayReader(this.data, k); - } - - @Override - public boolean setMultiRun(int runs) { - return false; - } - -} diff --git a/src/main/java/edu/uc/rphash/tests/clusterers/LloydIterativeKmeans.java b/src/main/java/edu/uc/rphash/tests/clusterers/LloydIterativeKmeans.java deleted file mode 100644 index 23ecf97..0000000 --- a/src/main/java/edu/uc/rphash/tests/clusterers/LloydIterativeKmeans.java +++ /dev/null @@ -1,250 +0,0 @@ -package edu.uc.rphash.tests.clusterers; - -import java.util.ArrayList; -import java.util.LinkedList; -import java.util.List; -import java.util.Random; - -import edu.uc.rphash.Centroid; -import edu.uc.rphash.Clusterer; -import edu.uc.rphash.Readers.RPHashObject; -import edu.uc.rphash.Readers.SimpleArrayReader; -import edu.uc.rphash.projections.DBFriendlyProjection; -import edu.uc.rphash.projections.Projector; -import edu.uc.rphash.tests.generators.GenerateData; -import edu.uc.rphash.util.VectorUtil; - -public class LloydIterativeKmeans implements Clusterer { - int k; - int n; - List data; - public int getK() { - return k; - } - - public void setK(int k) { - this.k = k; - } - - public List getData() { - return data; - } - - - @Override - public void setRawData(List data) { - this.data = data; - } - - @Override - public void setData(List centroids) { - ArrayList data = new ArrayList(centroids.size()); - for(Centroid c : centroids)data.add(c.centroid()); - setRawData(data); - } - - public List getWeights() { - return weights; - } - - public void setWeights(List weights) { - this.weights = weights; - } - - int projdim; - - List means; - List> clusters; - List weights; - - public LloydIterativeKmeans(int k, List data) { - this.k = k; - this.data = data; - this.projdim = 0; - this.clusters = null; - this.weights = new ArrayList(data.size()); - for (int i = 0; i < data.size(); i++) - weights.add(1f); - means = null; - } - - public LloydIterativeKmeans(int k, List data, List weights) { - this.k = k; - this.data = data; - this.projdim = 0; - 
this.clusters = null; - this.weights = weights; - means = null; - } - - public LloydIterativeKmeans(int k, List data, int projdim) { - this.k = k; - this.data = data; - this.projdim = projdim; - this.clusters = null; - this.weights = new ArrayList(data.size()); - for (int i = 0; i < data.size(); i++) - weights.add(1f); - means = null; - } - - public LloydIterativeKmeans() { - // TODO Auto-generated constructor stub - } - - public float[] computeCentroid(List vectors, List data) { - int d = data.get(0).length; - float[] centroid = new float[d]; - - for (int i = 0; i < d; i++) - centroid[i] = 0.0f; - - float w_total = 0f; - for (Integer v : vectors) { - w_total += weights.get(v); - } - - for (Integer v : vectors) { - float[] vec = data.get(v); - float weight = (float) weights.get(v) / (float) w_total; - for (int i = 0; i < d; i++) - centroid[i] += (vec[i] * weight); - } - return centroid; - } - - ArrayList weightTotals; - - void updateMeans(List data) { - weightTotals = new ArrayList(); - if (means == null) { - means = new ArrayList(); - for (int i = 0; i < k; i++) - means.add(computeCentroid(clusters.get(i), data)); - } - for (int i = 0; i < k; i++) - means.set(i, computeCentroid(clusters.get(i), data)); - } - - int assignClusters(List data) { - int swaps = 0; - List> newClusters = new ArrayList>(); - for (int j = 0; j < k; j++) - newClusters.add(new ArrayList()); - - for (int clusterid = 0; clusterid < k; clusterid++) { - - for (Integer member : clusters.get(clusterid)) { - - int nearest = VectorUtil.findNearestDistance(data.get(member), - means); - newClusters.get(nearest).add(member); - if (nearest != clusterid) - swaps++; - } - - } - clusters = newClusters; - return swaps; - } - - private void run() { - int maxiters = 1000; - int swaps = 2; - this.n = this.data.size(); - ArrayList workingdata = new ArrayList(); - // stuff for projected kmeans - Projector p = null; - Random r = new Random(); - if (projdim != 0) - p = new 
DBFriendlyProjection(this.data.get(0).length, projdim, - r.nextInt()); - for (float[] v : this.data) { - if (p != null) { - workingdata.add(p.project(v)); - } else - workingdata.add(v); - } - - int maxout = 0; - //loop until there are no more nullsets - boolean nullset = false; - do { - this.clusters = new ArrayList>(k); - // seed data with new clusters - ArrayList shufflelist = new ArrayList(data.size()); - for (int i = 0; i < data.size(); i++) - shufflelist.add(i); - - for (int i = 0; i < k; i++) { - List tmp = new LinkedList(); - tmp.add(shufflelist.remove(0)); - - for (int j = 1; j < workingdata.size() / k ; j++) { - int nxt = r.nextInt(shufflelist.size()); - tmp.add(shufflelist.remove(nxt)); - } - this.clusters.add(tmp); - } - - - cluster(maxiters, swaps, n, workingdata, clusters); - - nullset = false; - - for (List cluster : clusters) { - nullset |= (cluster.size() == 0); - } - - } while (nullset && ++maxout<100); - if (maxout == 100) - System.err.println("Warning: MaxIterations Reached Outer"); - - } - - public void cluster(int maxiters, int swaps, int n, - ArrayList workingdata, List> clusters) { - while (swaps > 0 && maxiters > 0) { - maxiters--; - updateMeans(workingdata); - swaps = assignClusters(workingdata); - } - if (maxiters == 0) - System.err.println("Warning: MaxIterations Reached"); - updateMeans(this.data); - } - - @Override - public List getCentroids() { - if (means == null) - run(); - List centroids = new ArrayList<>(means.size()); - for(float[] f : means){ - centroids.add(new Centroid(f,0)); - } - return centroids; - } - - @Override - public void reset(int randomseed) { - means = null; - } - - public static void main(String[] args) { - GenerateData gen = new GenerateData(8, 100, 100); - LloydIterativeKmeans kk = new LloydIterativeKmeans(5, gen.data(), 24); -// VectorUtil.prettyPrint(kk.getCentroids()); - } - - @Override - public RPHashObject getParam() { - - return new SimpleArrayReader(this.data, k); - } - - @Override - public boolean 
setMultiRun(int runs) { - return false; - } - -} diff --git a/src/main/java/edu/uc/rphash/tests/clusterers/MLE2.java b/src/main/java/edu/uc/rphash/tests/clusterers/MLE2.java deleted file mode 100644 index 4519987..0000000 --- a/src/main/java/edu/uc/rphash/tests/clusterers/MLE2.java +++ /dev/null @@ -1,333 +0,0 @@ -package edu.uc.rphash.tests.clusterers; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Random; - -import edu.uc.rphash.Centroid; -import edu.uc.rphash.Clusterer; -import edu.uc.rphash.Readers.RPHashObject; -import edu.uc.rphash.Readers.SimpleArrayReader; - -/** - * @author lee - * learns mle model with T topics from words x docs counts data - */ -/* - * All private methods are so because they for speed reasons do not employ any - * form of checking for numerical stability. In the case of mle matrices this is - * acceptable as probability matrices are never negative and the dimensions of - * the matrices do not change. - * for a unit test of mle, mle will be used to produce the NMF of a matrix, functionality - * and correctness can be confirmed by finding the product to be equal to the input - * matrix - */ -public class MLE2 implements Clusterer { - - - - /** - * @param args - */ - public static void main(String[] args) { - float[][] F = {{1f, 0f, 0f, 2f, 0f},{1f, 1f, 1f, 0f, 0f}, {0f, 0f, 1f, 1f , 1f},{2f, 1f, 1f, 0f, 1f},{1f, 0f, 0f, 2f, 0f},{1f, 1f, 1f, 0f, 0f}, {0f, 0f, 1f, 1f , 1f},{2f, 1f, 1f, 0f, 1f}}; - for(float[] ff: F){ - System.out.println(); - for(float f: ff){ - System.out.print(f + " "); - } - }System.out.println(); - - - MLE2 mlobj = new MLE2(Arrays.asList(F),4,.00001f); - - printmat(normalize(F)); - printmat(mlobj.wt);printmat(mlobj.td); - printmat( multiply(mlobj.wt,mlobj.td)); - - } - - - int W;//words, rows - int D;//documents, columns - int T;// topics or latent classes - - public float[][] td; - public float[][] wt; - Listdata; - - public MLE2(List counts, int T,float epsilon) - { - 
this.data = counts; - W=counts.size(); - D = counts.get(0).length; - this.T = T; - mle(epsilon); - } - - // use if you want wt initialized to some specific value - public void mle( float epsilon) - { - float tot = sum(data); - td = normalize(ones(T,D)); - wt = normalize(rand(W,T)); - - - float[] E = sum1D(logDotProduct(data,multiply(wt,td))); - float F = sum(E)/tot; - float F_new ; - float rel_ch; - - - do - { - // Expectation Step - // td = norm(td .* ( wt' * ( counts ./ (wt * td) ) )); - td = normalize(dotProduct(td,(multiply(transpose(wt),dotDivide(data,multiply(wt,td)))))); - - //maximization step - //wt = normalize( wt .* ( ( counts ./ ( wt * td + eps ) ) * td' )) - wt = normalize(dotProduct(wt,multiply(dotDivide(data,multiply(wt,td)),transpose(td)))); - - //calculate log-likelihood - /* - * ___ ___ - * \ \ - * /__ /__ n(d,w) log P(d,w) - * d c D w c W - */ - E = sum1D(logDotProduct(data,multiply(wt,td))); - F_new = sum(E)/tot; - - //calculate iteration's relative change to determine convergence - rel_ch = Math.abs((F_new - F))/ Math.abs(F); - F= F_new; - - System.out.println(rel_ch); - - }while(rel_ch>epsilon); - - } - - - //testing status - works - //gets the pairwise products of two matrices - //no dimension checking - private static float[][] dotProduct(float[][] mat1, float[][] mat2) - { - float[][] rtrn = new float [mat1.length ][mat1[0].length]; - - for(int i = 0;i mat1, float[][] mat2) - { - float[][] rtrn = new float [mat1.size() ][mat1.get(0).length]; - - for(int i = 0;i mat1, float[][] mat2) - { - float[][] rtrn = new float [mat1.size() ][mat1.get(0).length]; - - for(int i = 0;i A) - { - float sum =0; - for(float[] ff:A){ - for(float f:ff) sum+=f; - } - return sum; - } - - //testing status - works - //give the sum of all the elements of a vector - public static float sum(float[]A) - { - float sum =0; - for(float f:A) sum+=f; - return sum; - } - - //testing status - works - //give the column vector sum of all the elements of a matrix - public static 
float[] sum1D(float[][] A) - { - float[] sum = new float[A[0].length]; - Arrays.fill(sum, 0f); - for(float[] ff:A) - { - for(int i = 0;i getCentroids() { - - // TODO Auto-generated method stub - return null; - } - - @Override - public RPHashObject getParam() { - return new SimpleArrayReader(this.data, T); - } - - @Override - public void setWeights(List counts) { - - } - - @Override - public void setRawData(List data) { - this.data = data; - } - - @Override - public void setData(List centroids) { - ArrayList data = new ArrayList(centroids.size()); - for(Centroid c : centroids)data.add(c.centroid()); - setRawData(data); - } - - @Override - public void setK(int getk) { - } - - @Override - public void reset(int randomseed) { - - } - - @Override - public boolean setMultiRun(int runs) { - return false; - } - -} diff --git a/src/main/java/edu/uc/rphash/tests/clusterers/MaxLikelihoodKMeans2.java b/src/main/java/edu/uc/rphash/tests/clusterers/MaxLikelihoodKMeans2.java deleted file mode 100644 index e3f72fa..0000000 --- a/src/main/java/edu/uc/rphash/tests/clusterers/MaxLikelihoodKMeans2.java +++ /dev/null @@ -1,451 +0,0 @@ -package edu.uc.rphash.tests.clusterers; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.Vector; - -import edu.uc.rphash.Centroid; -import edu.uc.rphash.Clusterer; -import edu.uc.rphash.RPHashStream; -import edu.uc.rphash.Readers.RPHashObject; -import edu.uc.rphash.tests.StatTests; -import edu.uc.rphash.tests.generators.GenerateStreamData; - -public class MaxLikelihoodKMeans2 implements Clusterer { - - class PointND { - - private int dimension; // number of coordinates of a point - private float[] coordinates; // the coordinates of a point - - /** - * Create a point centered at the origin of the specific dimension - **/ - public PointND(int dimension) { - this.dimension = dimension; - coordinates = new float[dimension]; - } - - public PointND(float[] data) { - this.dimension = data.length; - 
coordinates = data; - } - - /** - * Create a new point identical to point p - **/ - public PointND(PointND p) { - this.dimension = p.dimension; - this.coordinates = new float[dimension]; - for (int i = 0; i < dimension; i++) - this.coordinates[i] = p.coordinates[i]; - } - } - - private int n; // number of instances to classify - private int d; // number of coordinates of each point - private int k; // number of clusters - private PointND[] mu; // coordinate of means mu[j] of each cluster j - private Vector[] w; // holds the points classified into each class - // w[j] - private PointND[] sigma; // holds the standard deviation of each class i - private float[] prior; // holds the prior of each class i - // private float logLikelihood; // holds the log likelihood of each of the k - // Gaussians - private float MDL; // the minimum description length of the model - private int numIterations; - - private List centroids; - private PointND[] data; - - public MaxLikelihoodKMeans2(int getk, List data) { - this.data = new PointND[data.size()]; - for (int i = 0; i < data.size(); i++) { - this.data[i] = new PointND(data.get(i)); - } - this.centroids = null; - init(this.data, getk); - } - - public MaxLikelihoodKMeans2() { - } - - /** - * Intialize the parameters of the k-means algorithm Randomly assign a point - * in x to each mean mu[j] - **/ - private void init(PointND[] x, int k) { - this.n = x.length; - this.d = x[0].dimension; - this.k = k; - this.mu = new PointND[k]; - this.w = new Vector[k]; - this.numIterations = 0; - this.sigma = new PointND[k]; - this.prior = new float[k]; - - // randomly assign a point in x to each mean mu[j] - PointND randomPoint; - for (int j = 0; j < k; j++) { - randomPoint = x[(int) (Math.random() * (n - 1))]; - mu[j] = new PointND(randomPoint); - // each prior and standard deviation are set to zero - sigma[j] = new PointND(d); - prior[j] = 0; - } - } - - /** - * Runs the k-means algorithm with k clusters on the set of instances x Then - * find 
the quality of the model - **/ - public void run(PointND[] x, int k, float epsilon) { - float maxDeltaMeans = epsilon + 1; - PointND[] oldMeans = new PointND[k]; - // initialize n,k,mu[j] - init(x, k); - // iterate until there is no change in mu[j] - while (maxDeltaMeans > epsilon) { - // remember old values of the each mean - for (int j = 0; j < k; j++) { - oldMeans[j] = new PointND(mu[j]); - - } - - // classify each instance x[i] to its nearest class - // first we need to clear the class array since we are reclassifying - for (int j = 0; j < k; j++) { - w[j] = new Vector(); // could use clear but then have - // to init... - } - - for (int i = 0; i < n; i++) { - classify(x[i]); - } - // recompute each mean - computeMeans(); - // compute the largest change in mu[j] - maxDeltaMeans = maxDeltaMeans(oldMeans); - numIterations++; - } - } - - /** - * Classifies the point x to the nearest class - **/ - private void classify(PointND x) { - float dist = 0; - float smallestDist; - int nearestClass; - - // compute the distance x is from mean mu[0] - smallestDist = distance(x.coordinates, mu[0].coordinates); - nearestClass = 0; - - // compute the distance x is from the other classes - for (int j = 1; j < k; j++) { - dist = distance(x.coordinates, mu[j].coordinates); - if (dist < smallestDist) { - smallestDist = dist; - nearestClass = j; - } - } - // classify x into class its nearest class - w[nearestClass].add(x); - } - - float distance(float[] x, float[] y) { - float ret = 0.0f; - if (x.length != y.length) - return Float.MAX_VALUE; - for (int i = 0; i < x.length; i++) - ret += (x[i] - y[i]) * (x[i] - y[i]); - return (float) Math.sqrt(ret); - } - - float[] subtract(float[] x, float[] y) { - float[] ret = new float[x.length]; - if (x.length != y.length) - return null; - for (int i = 0; i < x.length; i++) - ret[i] = x[i] - y[i]; - return ret; - } - - float[] add(float[] x, float[] y) { - float[] ret = new float[x.length]; - if (x.length != y.length) - return null; - for (int i 
= 0; i < x.length; i++) - ret[i] = x[i] + y[i]; - return ret; - } - - float[] multiply(float[] x, float scalar) { - float[] ret = new float[x.length]; - for (int i = 0; i < x.length; i++) - ret[i] = x[i] * scalar; - return ret; - } - - public float max(float[] coordinates) { - float value; - float max = coordinates[0]; - for (int i = 1; i < coordinates.length; i++) { - value = coordinates[i]; - if (value > max) - max = value; - } - return max; - } - - /** - * Recompute mu[j] as the average of all points classified to the class w[j] - **/ - private void computeMeans() { - int numInstances; // number of instances in each class w[j] - PointND instance; - - // init the means to zero - for (int j = 0; j < k; j++) - mu[j] = new PointND(mu[j].dimension); - - // recompute the means of each cluster - for (int j = 0; j < k; j++) { - numInstances = w[j].size(); - for (int i = 0; i < numInstances; i++) { - instance = w[j].get(i); - mu[j] = new PointND( - add(mu[j].coordinates, instance.coordinates)); - // mu[j].add(instance); - } - // mu[j].multiply(1.0f / numInstances); - mu[j] = new PointND( - multiply(mu[j].coordinates, 1.0f / numInstances)); - } - - } - - /** - * Compute the maximum change over each mean mu[j] - **/ - private float maxDeltaMeans(PointND[] oldMeans) { - float delta; - oldMeans[0] = new PointND(subtract(oldMeans[0].coordinates, - mu[0].coordinates)); - // oldMeans[0].subtract(mu[0]); - - float maxDelta = max(oldMeans[0].coordinates); - for (int j = 1; j < k; j++) { - // oldMeans[j].subtract(mu[j]); - oldMeans[j] = new PointND(subtract(oldMeans[j].coordinates, - mu[j].coordinates)); - delta = max(oldMeans[j].coordinates); - if (delta > maxDelta) - maxDelta = delta; - } - return maxDelta; - } - - // /** - // * Compute the standard deviation of the k Gaussians - // **/ - // private void computeDeviation() { - // int numInstances; // number of instances in each class w[j] - // PointND instance; - // PointND temp; - // - // // set the standard deviation to zero - 
// for (int j = 0; j < k; j++) - // sigma[j].setToOrigin(); - // - // // for each cluster j... - // for (int j = 0; j < k; j++) { - // numInstances = w[j].size(); - // for (int i = 0; i < numInstances; i++) { - // instance = (PointND) (w[j].get(i)); - // temp = new PointND(instance); - // temp.subtract(mu[j]); - // temp.pow(2.0f); // (x[i]-mu[j])^2 - // temp.multiply(1.0f / numInstances); // multiply by proba of - // // having x[i] in cluster j - // sigma[j].add(temp); // sum i (x[i]-mu[j])^2 * p(x[i]) - // } - // sigma[j].pow((1.0f / 2f)); // because we want the standard deviation - // } - // } - // - // /** - // * Compute the priors of the k Gaussians - // **/ - // private void computePriors() { - // float numInstances; // number of instances in each class w[j] - // for (int j = 0; j < k; j++) { - // numInstances = w[j].size() * (1.0f); - // prior[j] = numInstances / n; - // } - // } - // - // /** - // * Assume the standard deviations and priors of each cluster have been - // * computed - // **/ - // private void computeLogLikelihood(PointND[] x) { - // float temp1 = 0; - // float temp2 = 0; - // PointND variance; - // float ln2 = (float) Math.log(2); - // // for each instance x - // for (int i = 0; i < n; i++) { - // // for each cluster j - // temp1 = 0; - // for (int j = 0; j < k; j++) { - // temp1 = temp1 + (x[i].normal(mu[j], sigma[j]) * prior[j]); - // } - // temp2 = (float) (temp2 + Math.log(temp1) / ln2); - // } - // logLikelihood = temp2; - // } - // - // /** - // * Assume the log likelihood and priors have been computed - // **/ - // private void computeMDL() { - // float temp = 0; - // float numInstances; - // float ln2 = (float) Math.log(2); - // for (int j = 0; j < k; j++) { - // numInstances = w[j].size(); - // for (int i = 0; i < d; i++) { - // temp = (float) (temp - Math.log(sigma[j].getCoordinate(i) - // / Math.sqrt(numInstances)) - // / ln2); - // } - // } - // MDL = temp - logLikelihood; - // } - - public float getMDL() { - return MDL; - } - - 
public List getCentroids() { - float epsilon = 0.01f; - if (centroids != null){ - - return centroids; - } - init(data, k); - run(data, d, epsilon); - centroids = new ArrayList(k); - for (int i = 0; i < k; i++) - centroids.add(new Centroid(mu[i].coordinates,0)); - - // compute sum of squares - double sigtotal = 0.0; - for (int i = 0; i < sigma.length; i++) - for (int j = 0; j < sigma[i].dimension; j++) - sigtotal += sigma[i].coordinates[j]; - - return centroids; - } - - @Override - public RPHashObject getParam() { - // TODO Auto-generated method stub - return null; - } - - @Override - public void setWeights(List counts) { - - } - - @Override - public void setData(List data) { - this.centroids = null; - this.data = new PointND[data.size()]; - for (int i = 0; i < data.size(); i++) { - this.data[i] = new PointND(data.get(i).centroid()); - } - } - @Override - public void setRawData(List data) { - this.centroids = null; - this.data = new PointND[data.size()]; - for (int i = 0; i < data.size(); i++) { - this.data[i] = new PointND(data.get(i)); - } - } - - @Override - public void setK(int getk) { - this.k = getk; - } - - @Override - public void reset(int randomseed) { - centroids = null; - } - - public static void main(String[] args) { - int k = 10; - int d = 240; - float var = 1f; - int interval = 1000; - Runtime rt = Runtime.getRuntime(); - - GenerateStreamData gen1 = new GenerateStreamData(k, d, var, 11331313); - GenerateStreamData noise = new GenerateStreamData(1, d, var * 10, - 11331313); - MaxLikelihoodKMeans2 km2 = new MaxLikelihoodKMeans2(); - // HartiganWongKMeans hwkm = new HartiganWongKMeans(); - - System.out.printf("\tKMeans\t\t\tNull\t\tReal\n"); - System.out - .printf("Vecs\tMem(KB)\tTime\tWCSSE\t\tTime\tWCSSE\t\tWCSSE\n"); - - long timestart = System.nanoTime(); - for (int i = 0; i < 2500000;) { - ArrayList vecsAndNoiseInThisRound = new ArrayList(); - ArrayList justvecsInThisRound = new ArrayList(); - - for (int j = 1; j < interval && i < 2500000; i++, j++) 
{ - float[] vec = gen1.generateNext(); - vecsAndNoiseInThisRound.add(vec); - justvecsInThisRound.add(vec); - vecsAndNoiseInThisRound.add(noise.generateNext()); - } - - timestart = System.nanoTime(); - km2.setRawData(vecsAndNoiseInThisRound); - km2.setK(k); - - List cents = km2.getCentroids(); - long time = System.nanoTime() - timestart; - - rt.gc(); - long usedkB = (rt.totalMemory() - rt.freeMemory()) / 1024; - - double wcsse = StatTests.WCSSECentroidsFloat(cents, justvecsInThisRound); - double realwcsse = StatTests.WCSSE(gen1.medoids, - justvecsInThisRound); - System.out.printf("%d\t%d\t%.4f\t%.1f\t\t", i, usedkB, - time / 1000000000f, wcsse); - - cents = new HartiganWongKMeans(k, vecsAndNoiseInThisRound) - .getCentroids(); - time = System.nanoTime() - timestart; - usedkB = (rt.totalMemory() - rt.freeMemory()) / 1024; - wcsse = StatTests.WCSSECentroidsFloat(cents, justvecsInThisRound); - System.out.printf("%.4f\t%.1f\t\t%.1f\n", time / 1000000000f, - wcsse, realwcsse); - } - } - @Override - public boolean setMultiRun(int runs) { - return false; - } - -} diff --git a/src/main/java/edu/uc/rphash/tests/clusterers/SVD.java b/src/main/java/edu/uc/rphash/tests/clusterers/SVD.java deleted file mode 100644 index da67542..0000000 --- a/src/main/java/edu/uc/rphash/tests/clusterers/SVD.java +++ /dev/null @@ -1,456 +0,0 @@ -package edu.uc.rphash.tests.clusterers; - -import java.util.ArrayList; -import java.util.Collections; - -public class SVD { - /** - * returns U in a. normaly U is nr*nr, but if nr>nc only the first nc - * columns are returned (nice, saves memory). The columns of U have - * arbitrary sign, also the columns corresponding to near-zero singular - * values can vary wildly from other implementations. 
- *This function is adapted from the c coded method from Numerical Recipes in C - */ - - private float[][] A; - SVDMatrix svdmat; - - public SVD(float[][] A) - { - this.A=A; - } - - public void compute() - { - - - - float[] D = new float[A[0].length]; - float[][] V = new float[A[0].length][A[0].length]; - - svdmat = new SVDMatrix(A,D,V,A.length < A[0].length); - - svd(svdmat.getU(),svdmat.getD(),svdmat.getV()); - } - - public float[][] getU(){ - return svdmat.getV(); - } - - public float[][] getD() - { - return padV(svdmat.getD()); - } - - public float[][] getVT(){ - return transpose(svdmat.getV()); - } - - public void svd(float[][] a, float[] w, float[][] v) { - int i, its, j, jj, k, l = 0, nm = 0; - boolean flag; - int m = a.length; - int n = a[0].length; - float c, f, h, s, x, y, z; - float anorm = 0.f, g = 0.f, scale = 0.f; - float[] rv1 = new float[n]; - - for (i = 0; i < n; i++) { - l = i + 1; - rv1[i] = scale * g; - g = s = scale = 0.f; - if (i < m) { - for (k = i; k < m; k++) - scale += abs(a[k][i]); - if (scale != 0.0) { - for (k = i; k < m; k++) { - a[k][i] /= scale; - s += a[k][i] * a[k][i]; - } - f = a[i][i]; - g = -SIGN((float)Math.sqrt(s), f); - h = f * g - s; - a[i][i] = f - g; - // if (i!=(n-1)) { // CHECK - for (j = l; j < n; j++) { - for (s = 0, k = i; k < m; k++) - s += a[k][i] * a[k][j]; - f = s / h; - for (k = i; k < m; k++) - a[k][j] += f * a[k][i]; - } - // } - for (k = i; k < m; k++) - a[k][i] *= scale; - } - } - w[i] = scale * g; - g = s = scale = 0.0f; - if (i < m && i != n - 1) { // - for (k = l; k < n; k++) - scale += abs(a[i][k]); - if (scale != 0.) 
{ - for (k = l; k < n; k++) { // - a[i][k] /= scale; - s += a[i][k] * a[i][k]; - } - f = a[i][l]; - g = -SIGN((float)Math.sqrt(s), f); - h = f * g - s; - a[i][l] = f - g; - for (k = l; k < n; k++) - rv1[k] = a[i][k] / h; - if (i != m - 1) { // - for (j = l; j < m; j++) { // - for (s = 0, k = l; k < n; k++) - s += a[j][k] * a[i][k]; - for (k = l; k < n; k++) - a[j][k] += s * rv1[k]; - } - } - for (k = l; k < n; k++) - a[i][k] *= scale; - } - } // i= 0; --i) { - if (i < n - 1) { // - if (g != 0.) { - for (j = l; j < n; j++) - v[j][i] = (a[i][j] / a[i][l]) / g; - for (j = l; j < n; j++) { - for (s = 0, k = l; k < n; k++) - s += a[i][k] * v[k][j]; - for (k = l; k < n; k++) - v[k][j] += s * v[k][i]; - } - } - for (j = l; j < n; j++) - // - v[i][j] = v[j][i] = 0.0f; - } - v[i][i] = 1.0f; - g = rv1[i]; - l = i; - } - // for (i=IMIN(m,n);i>=1;i--) { // ! - // for (i = n-1; i>=0; --i) { - for (i = Math.min(m - 1, n - 1); i >= 0; --i) { - l = i + 1; - g = w[i]; - if (i < n - 1) // - for (j = l; j < n; j++) - // - a[i][j] = 0.0f; - if (g != 0.) 
{ - g = 1.f / g; - if (i != n - 1) { - for (j = l; j < n; j++) { - for (s = 0, k = l; k < m; k++) - s += a[k][i] * a[k][j]; - f = (s / a[i][i]) * g; - for (k = i; k < m; k++) - a[k][j] += f * a[k][i]; - } - } - for (j = i; j < m; j++) - a[j][i] *= g; - } else { - for (j = i; j < m; j++) - a[j][i] = 0.0f; - } - a[i][i] += 1.0; - } - for (k = n - 1; k >= 0; --k) { - for (its = 1; its <= 30; ++its) { - flag = true; - for (l = k; l >= 0; --l) { - nm = l - 1; - if ((abs(rv1[l]) + anorm) == anorm) { - flag = false; - break; - } - if ((abs(w[nm]) + anorm) == anorm) - break; - } - if (flag) { - c = 0.0f; - s = 1.0f; - for (i = l; i <= k; i++) { // - f = s * rv1[i]; - rv1[i] = c * rv1[i]; - if ((abs(f) + anorm) == anorm) - break; - g = w[i]; - h = pythag(f, g); - w[i] = h; - h = 1.0f / h; - c = g * h; - s = -f * h; - for (j = 0; j < m; j++) { - y = a[j][nm]; - z = a[j][i]; - a[j][nm] = y * c + z * s; - a[j][i] = z * c - y * s; - } - } - } // flag - z = w[k]; - if (l == k) { - if (z < 0.) { - w[k] = -z; - for (j = 0; j < n; j++) - v[j][k] = -v[j][k]; - } - break; - } // l==k - x = w[l]; - nm = k - 1; - y = w[nm]; - g = rv1[nm]; - h = rv1[k]; - f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2 * h * y); - g = pythag(f, 1.0f); - f = ((x - z) * (x + z) + h * ((y / (f + SIGN(g, f))) - h)) / x; - c = s = 1.0f; - for (j = l; j <= nm; j++) { - i = j + 1; - g = rv1[i]; - y = w[i]; - h = s * g; - g = c * g; - z = pythag(f, h); - rv1[j] = z; - c = f / z; - s = h / z; - f = x * c + g * s; - g = g * c - x * s; - h = y * s; - y *= c; - for (jj = 0; jj < n; jj++) { - x = v[jj][j]; - z = v[jj][i]; - v[jj][j] = x * c + z * s; - v[jj][i] = z * c - x * s; - } - z = pythag(f, h); - w[j] = z; - if (z != 0.0) { - z = 1.0f / z; - c = f * z; - s = h * z; - } - f = c * g + s * y; - x = c * y - s * g; - for (jj = 0; jj < m; ++jj) { - y = a[jj][j]; - z = a[jj][i]; - a[jj][j] = y * c + z * s; - a[jj][i] = z * c - y * s; - } - } // j= 0. ? 
abs(a) : -abs(a)); - } - - //creates a diagonal matrix by padding the vector(v) with zeros - public static float[][] padV(float[] v) - { - float rtrn[][] = new float[v.length][v.length]; - for(int i =0;i svdpairs; - float[] D; - float[][] U; - float[][]V; - boolean sorted; - - - public SVDMatrix(float[][] u, float[] d, float[][]v, boolean transpose) - { - transpose = false; - sorted = false; - - /* if(transpose){ - D=d; - U=transpose(v); - V=u; - }else - { - */ D=d; - U=u; - V=v; - // } - } - - public void sortSingularValues() - { - svdpairs = new ArrayList(D.length); - - for(int i = 0; i< D.length;i++){ - float urow[] = new float[U[0].length]; - float vrow[] = new float[V[0].length]; - - for(int j = 0; j { - float singularValue; - float[] urows; - float[] vrows; - - public SVDValuePairs(float singularValue, float[] urows,float[] vrows){ - this.singularValue = singularValue; - this.vrows = vrows; - this.urows = urows; - } - - public int compareTo(SVDValuePairs o) { - if(o instanceof SVDValuePairs) - return 0; - if(((SVDValuePairs)o).singularValue == this.singularValue)return 0; - if(((SVDValuePairs)o).singularValue > this.singularValue)return 1; - return -1; - } - } - } -} diff --git a/src/main/java/edu/uc/rphash/tests/generators/GenerateData.java b/src/main/java/edu/uc/rphash/tests/generators/GenerateData.java index 5e81e50..d0f374e 100644 --- a/src/main/java/edu/uc/rphash/tests/generators/GenerateData.java +++ b/src/main/java/edu/uc/rphash/tests/generators/GenerateData.java @@ -517,6 +517,7 @@ public List getData() { return data; } + @Override public List getLabels() { // TODO Auto-generated method stub @@ -569,11 +570,11 @@ public static void main(String[] args) throws NumberFormatException, List truncatedArgs = new ArrayList(); Map taggedArgs = argsUI(args, truncatedArgs); - int k = 10; - int d = 1000; - int n = 20000; - float var = 1f; - float sparseness = 1f; + int k = 20; + int d = 200; + int n = 10000; + float var = 0.8f; //1.0f; + float sparseness = 1.0f; 
//1f; boolean shuffle = true; boolean raw = false; @@ -585,7 +586,8 @@ public static void main(String[] args) throws NumberFormatException, if(taggedArgs.containsKey("shuffled"))shuffle = Boolean.parseBoolean(taggedArgs.get("shuffled")); if(taggedArgs.containsKey("raw"))raw = Boolean.parseBoolean(taggedArgs.get("raw")); - File outputFile = new File(args[0]+"_"+k+"x"+d+"x"+n+".mat"); + //File outputFile = new File(args[0]+"_"+k+"x"+d+"x"+n+".txt"); // ".mat" + File outputFile = new File(args[0] +"1D"+".txt"); File lblFile = new File(args[0]+"_"+k+"x"+d+"x"+n+".lbl"); System.out.printf("k=%d, n=%d, d=%d, var=%f, sparseness=%f %s > %s",k,n, diff --git a/src/main/java/edu/uc/rphash/tests/plotting.java b/src/main/java/edu/uc/rphash/tests/plotting.java new file mode 100644 index 0000000..2bebfac --- /dev/null +++ b/src/main/java/edu/uc/rphash/tests/plotting.java @@ -0,0 +1,86 @@ +package edu.uc.rphash.tests; + +import java.util.LinkedList; +import java.util.List; +import java.util.Random; + +//import org.knowm.xchart.*; +import org.knowm.xchart.XYSeries.XYSeriesRenderStyle; +import org.knowm.xchart.style.Styler.LegendPosition; +import org.knowm.xchart.style.markers.SeriesMarkers; +import org.knowm.xchart.internal.Utils; +import org.knowm.xchart.QuickChart; +import org.knowm.xchart.SwingWrapper; +import org.knowm.xchart.XYChart; +import org.knowm.xchart.XYChartBuilder; +import org.knowm.xchart.XYSeries; + +import edu.uc.rphash.tests.generators.GenerateData; + +public class plotting { + + static Random random = new Random(); + + private static List getGaussian(int number, double mean, double std) { + + List seriesData = new LinkedList(); + for (int i = 0; i < number; i++) { + seriesData.add(mean + std * random.nextGaussian()); + } + + return seriesData; + + } + +public static void main(String[] args) { +double[] xData = new double[] { 0.0, 1.0, 2.0 }; +double[] yData = new double[] { 2.0, 1.0, 0.0 }; + +// Create Chart +XYChart chart = QuickChart.getChart("Sample Chart", 
"X", "Y", "y(x)", xData, yData); + +// Show it +new SwingWrapper(chart).displayChart(); + + +//Create Chart2 +XYChart chart2 = new XYChartBuilder().width(600).height(500).title("Gaussian Blobs").xAxisTitle("X").yAxisTitle("Y").build(); + +//Customize Chart2 +chart2.getStyler().setDefaultSeriesRenderStyle(XYSeriesRenderStyle.Scatter); +//chart2.getStyler().setChartTitleVisible(false); +//chart2.getStyler().setLegendPosition(LegendPosition.InsideSW); +chart2.getStyler().setMarkerSize(16); + +//Series + +int k = 10;//6; +int d = 2;//16; +int n = 10000; +float var = 1.5f; +GenerateData gen = new GenerateData(k, n/k, d, var, true, .5f); + +chart2.addSeries("Gaussian Blob 1", getGaussian(1000, 5, 1), getGaussian(1000, 5, 1)); + +XYSeries series2 = chart2.addSeries("Gaussian Blob 2", getGaussian(1000, 50, 1), getGaussian(1000, 50, 1)); + +XYSeries series3 = chart2.addSeries("Gaussian Blob 3", getGaussian(1000, 5, 1), getGaussian(1000, 50, 1)); + +XYSeries series4 = chart2.addSeries("Gaussian Blob 4", getGaussian(1000, 50, 1), getGaussian(1000, 5, 1)); + +XYSeries series5 = chart2.addSeries("Gaussian Blob 5", getGaussian(1000, 25, 1), getGaussian(1000, 25, 1)); + +//chart2.addSeries("Gaussian Blob 2", getDoubleArrayFromNumberList​(gen.getData()), getDoubleArrayFromNumberList​(gen.getData())); + +//XYSeries series = chart2.addSeries("Gaussian Blob 2", getDoubleArrayFromNumberList​(gen.getData()), getDoubleArrayFromNumberList​(gen.getData())); + +//series2.setMarker(SeriesMarkers.DIAMOND); + + +new SwingWrapper(chart2).displayChart(); + + } + + + +} \ No newline at end of file diff --git a/src/main/java/edu/uc/rphash/tests/testStreamingRPHash.java b/src/main/java/edu/uc/rphash/tests/testStreamingRPHash.java deleted file mode 100644 index ac62432..0000000 --- a/src/main/java/edu/uc/rphash/tests/testStreamingRPHash.java +++ /dev/null @@ -1,184 +0,0 @@ -package edu.uc.rphash.tests; - -import java.util.ArrayList; -import java.util.List; -import java.util.Random; - -import 
edu.uc.rphash.Centroid; -import edu.uc.rphash.RPHashMultiProj; -import edu.uc.rphash.RPHashStream; -import edu.uc.rphash.tests.clusterers.StreamingKmeans; -import edu.uc.rphash.tests.clusterers.StreamingKmeans2; -import edu.uc.rphash.tests.generators.GenerateStreamData; -import edu.uc.rphash.util.VectorUtil; - -public class testStreamingRPHash { - public static void readFileData(String[] args) throws Exception { - - int interval = 1000; - int k = 10; - String filename = "/home/lee/Desktop/Dimension3204/data.mat"; - int processors = Runtime.getRuntime().availableProcessors(); - if (args.length > 1) - filename = args[0]; - if (args.length > 2) - k = Integer.parseInt(args[1]); - if (args.length > 3) - processors = Integer.parseInt(args[0]); - - Runtime rt = Runtime.getRuntime(); - List data = VectorUtil.readFile(filename, false); - - RPHashStream rphit = new RPHashStream(data, k); - - // System.out.printf("Running Streaming RPHash on %d processors, d=%d,k=%d,n=%d\n",rphit.getProcessors(),d,k,interval); - // StreamClusterer rphit = new StreamingKmeans(data, k); - // System.out.printf("Running Streaming KMeans on %d processors, d=%d,k=%d\n",1,data.size(),k); - - System.out.printf("Vecs\tMem(KB)\tTime\tWCSSE\n"); - long timestart = System.nanoTime(); - - timestart = System.nanoTime(); - rphit.addVectorOnlineStep(data.get(0)); - for (int i = 1; i < 20000; i++) { - rphit.addVectorOnlineStep(data.get(i)); - - if (i % interval == 0) { - List cents = rphit.getCentroidsOfflineStep(); - long time = System.nanoTime() - timestart; - - rt.gc(); - long usedkB = (rt.totalMemory() - rt.freeMemory()) / 1024; - - double wcsse = StatTests.WCSSECentroidsFloat(cents, data); - - System.gc(); - System.out.printf("%d\t%d\t%.4f\t%.0f\n", i, usedkB, - time / 1000000000f, wcsse); - timestart = System.nanoTime(); - } - } - - } - - public static void generateAndStream() throws InterruptedException { - Random r = new Random(); - int k = 10; - int d = 1000; - float var = 1f; - int interval = 10000; 
- Runtime rt = Runtime.getRuntime(); - - GenerateStreamData gen1 = new GenerateStreamData(k, d, var, 11331313); - GenerateStreamData noise = new GenerateStreamData(1, d, var*10, 11331313); - RPHashStream rphit = new RPHashStream(k, gen1,rt.availableProcessors()); - StreamingKmeans2 skmi = new StreamingKmeans2(k, gen1,rt.availableProcessors()); - System.out.printf("\tStreamingRPHash\t\t\tStreamingKmeans\t\tReal\n"); - System.out.printf("Vecs\tMem(KB)\tTime\tWCSSE\t\tTime\tWCSSE\t\tWCSSE\n"); - - long timestart = System.nanoTime(); - for (int i = 0; i < 2500000;) { - ArrayList vecsAndNoiseInThisRound = new ArrayList(); - ArrayList justvecsInThisRound = new ArrayList(); - - for (int j = 1; j < interval && i < 2500000; i++, j++){ - float[] vec = gen1.generateNext(); - vecsAndNoiseInThisRound.add(vec); - justvecsInThisRound.add(vec); - if(r.nextInt(10)==1) - vecsAndNoiseInThisRound.add(noise.generateNext()); - } - - timestart = System.nanoTime(); - for (float[] f : vecsAndNoiseInThisRound) { - rphit.addVectorOnlineStep(f); - } - List cents = rphit.getCentroidsOfflineStep(); - long time = System.nanoTime() - timestart; - - rt.gc(); - long usedkB = (rt.totalMemory() - rt.freeMemory()) / 1024; - - double wcsse = StatTests.WCSSECentroidsFloat(cents, justvecsInThisRound); - double realwcsse = StatTests.WCSSE(gen1.medoids, justvecsInThisRound); - - System.out.printf("%d\t%d\t%.4f\t%.1f\t\t", i, usedkB, - time / 1000000000f, wcsse); - rt.gc(); - Thread.sleep(1000); - rt.gc(); - - timestart = System.nanoTime(); - for (float[] f : vecsAndNoiseInThisRound) { - skmi.addVectorOnlineStep(f); - } - - cents = skmi.getCentroidsOfflineStep(); - time = System.nanoTime() - timestart; - - rt.gc(); - usedkB = (rt.totalMemory() - rt.freeMemory()) / 1024; - - wcsse = StatTests.WCSSECentroidsFloat(cents, justvecsInThisRound); - // recreate vectors at execution time to check average - rt.gc(); - Thread.sleep(1000); - rt.gc(); - - System.out.printf("%.4f\t%.1f\t\t%.1f\n",time/ 
1000000000f,wcsse,realwcsse); - } - } - - public static void streamingPushtest() { - int k = 10; - int d = 1000; - float var = 4.5f; - - GenerateStreamData gen1 = new GenerateStreamData(k, d, var, 11331313); - - RPHashStream rphit = new RPHashStream(k,gen1); - - ArrayList cts = new ArrayList(); - for (int i = 0; i < 10000; i++) { - long centroidCount = rphit.addVectorOnlineStep(gen1.generateNext()); -// if (centroidCount>1 ) { -// cts.add((int) centroidCount); -// List f = rphit.getTopIdSizes(); -// for (float ff : f) -// System.out.print(ff/(float)i + ","); -// System.out.print("]\n["); -// } - } - //System.out.println(cts.toString()); - } - - public static void main(String[] args) throws Exception { -// readFileData(args); - generateAndStream(); -// streamingPushtest(); - } - - static void prettyPrint(List cs){ - - int n = cs.get(0).centroid.length; - boolean curtailm = n > 10; - if (curtailm) { - for (int i = 0; i < 4; i++) { - VectorUtil.prettyPrint(cs.get(i).centroid); - } - for (int j = 0; j < n / 2; j++) - System.out.print("\t"); - System.out.print(" ...\n"); - for (int i = cs.size() - 4; i < cs.size(); i++) { - VectorUtil.prettyPrint(cs.get(i).centroid); - } - } else { - for (int i = 0; i < cs.size(); i++) { - VectorUtil.prettyPrint(cs.get(i).centroid); - System.out.print("\n"); - } - } - - } - -} diff --git a/src/main/java/edu/uc/rphash/tests/test_elbow.java b/src/main/java/edu/uc/rphash/tests/test_elbow.java new file mode 100644 index 0000000..2b994fa --- /dev/null +++ b/src/main/java/edu/uc/rphash/tests/test_elbow.java @@ -0,0 +1,89 @@ +package edu.uc.rphash.tests; + +import java.util.ArrayList; +//import java.util.Arrays; +import java.util.List; + +import edu.uc.rphash.kneefinder.JythonTest; + + +public class test_elbow { + + + public static void main(String[] args){ + + + List counts = new ArrayList<>(50); + + double elbowdata[] = {5000, + 4000, + 3000, + 2000, + 1000, + 900, + 800, + 700, + 600, + 500, + 450, + 400, + 350, + 300, + 250, + 225, + 200, 
+ 175, + 150, + 125, + 100, + 75, + 50, + 25, + 24, + 23, + 22, + 21, + 20, + 19, + 18, + 17, + 16, + 15, + 14, + 13, + 12, + 11, + 10, + 10, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 8, + } ; + + int size = elbowdata.length ; + + for (int i= 0;i maxGap){ + maxGap = gap; + minMax[0] = sorted[i-1]; + minMax[1] = sorted[i]; + } + } + return minMax; + } + + /** + * The euclidean distance between two n-d points (order doesn't matter). + * @param a Point a + * @param b Point b + * @return The euclidean distance between two points. + */ + public static double dist(double[] a, double[] b){ + return Math.sqrt(Maths.distSq(a,b)); + } + + /** + * Returns the euclidean distance squared between two n-d points. + * @param a Point a. + * @param b Point b. + * @return The euclidean distance squared between two points. + */ + public static double distSq(double[] a, double[] b){ + double distSq = 0; + for (int i = 0; i < a.length; i++) { + distSq += Math.pow(a[i] - b[i], 2); + } + return distSq; + } + + /** + * @param x The variable input into the function. + * @param height The height of the center of the curve (sometimes called 'a'). + * @param center The center of the curve (sometimes called 'b'). + * @param width The standard deviation, i.e ~68% of the data will be contained in center ± the width. + * @return A gaussian function. 
+ */ + public static double gaussian(double x, double height, double center, double width){ + return height * Math.exp(-(x-center)*(x-center)/(2.0*width*width) ); + } + + public static long mean(long[] d){ + long total = 0; + for (long v : d) { + total += v; + } + return total/d.length; + } + + public static void shuffle(int[] array){ + Random rand = new Random(); + for (int i = array.length - 1; i > 0; i--) + { + int index = rand.nextInt(i + 1); + // Simple swap + int a = array[index]; + array[index] = array[i]; + array[i] = a; + } + } + + public static double mean(double[] d){ + double total = 0; + for (double v : d) { + total += v; + } + return total/d.length; + } + + public static double std(double[] data){ + double mean = mean(data); + double std = 0; + for (double d : data) { + double deviation = d - mean; + std += deviation * deviation; + } + std /= data.length; + return Math.sqrt(std); + } + + public static long[] absDeviationsFromMedian(long[] data){ + long median = median(data); + long[] deviations = new long[data.length]; + for (int i = 0; i < data.length; i++) { + deviations[i] = Math.abs(data[i] - median); + } + return deviations; + } + + /** + * Calculate the absolute deviations a sample has away from its median. + * @param data The data to determine median and deviations for. + * @return An array of absolute deviations away from the median. + */ + public static double[] absDeviationsFromMedian(double[] data){ + double median = median(data); + double[] deviations = new double[data.length]; + for (int i = 0; i < data.length; i++) { + deviations[i] = Math.abs(data[i] - median); + } + return deviations; + } + + /** + * Linearly interpolate resolve a starting point towards some ending point. + * @param startPt The point to start at. + * @param endPt The point to head towards. 
+ * @param alpha A value of 0 ends at the start pt, a value of 1 ends at the end point, a value + * greater than 1 over shoots the end point but continues following that same + * direction, likewise, a negative value heads backwards resolve the starting point + * with the end point reachable in a straight line. + * @return The newly interpolated position. + */ + public static double[] lerp(double[] startPt, double[] endPt, double alpha){ + if(startPt.length != endPt.length){ + throw new IllegalArgumentException("Start and end must have equal lengths."); + } + //we use c as the direction, and then as the final output + double[] c = new double[startPt.length]; + for (int i = 0; i < startPt.length; i++) { + c[i] = startPt[i] + ( (endPt[i] - startPt[i]) * alpha ); + } + return c; + } + + /** + * Do an element-wise subtraction such that, result[i] = a[i] - b[i]. + * @param a array a + * @param b array b + * @return the resulting "subtracted" result[] array. + */ + public static double[] sub(double[] a, double[] b){ + if(a.length != b.length){ + throw new IllegalArgumentException("Array A and B must be the same length."); + } + double[] result = new double[a.length]; + for (int i = 0; i < a.length; i++) { + result[i] = a[i] - b[i]; + } + return result; + } + + /** + * Do an element-wise subtraction such that, result[i] = a[i] - b[i]. + * @param a array a + * @param b array b + * @return the resulting "subtracted" result[] array. + */ + public static int[] sub(int[] a, int[] b){ + if(a.length != b.length){ + throw new IllegalArgumentException("Array A and B must be the same length."); + } + int[] result = new int[a.length]; + for (int i = 0; i < a.length; i++) { + result[i] = a[i] - b[i]; + } + return result; + } + + /** + * @param a the array + * @return The maximum absolute element in the array 'a'. 
+ */ + public static int maxAbsElement(double[] a){ + int max = (int) Math.abs(a[0]); + for (int i = 1; i < a.length; i++) { + int element = (int) Math.abs(a[i]); + if(element > max){ + max = element; + } + } + return max; + } + + /** + * Divide every element in array 'a' by a given scalar. + * @param a the array + * @param scalar the divisor + * @return The array such that, result[i] = a[i] / scalar + */ + public static double[] div(double[] a, double scalar){ + double[] result = new double[a.length]; + for (int i = 0; i < a.length; i++) { + result[i] = a[i] / scalar; + } + return result; + } + + /** + * Find the area of a triangle defined by three points: a,b,c. + * @param ax X coordinate of point a. + * @param ay Y coordinate of point a. + * @param bx X coordinate of point b. + * @param by Y coordinate of point b. + * @param cx X coordinate of point c. + * @param cy Y coordinate of point c. + * @return The area of the triangle. + */ + public static double triArea(double ax, double ay, + double bx, double by, + double cx, double cy){ + return Math.abs((ax - cx) * (by - ay) - (ax - bx) * (cy - ay)) * 0.5; + } + + public static double triArea3D(double ax, double ay, double az, + double bx, double by, double bz, + double cx, double cy, double cz) { + return 0.5 * Math.sqrt(dotSq(ax, ay, bx, by, cx, cy) + + dotSq(ax, az, bx, bz, cx, cz) + dotSq(ay, az, by, bz, cy, cz)); + } + + /** + * Returns the cross product of two 3d vectors. + * @param a 3d vector "a". + * @param b 3d vector "b". + * @return The cross product of a and b. In other words, the vector that is orthogonal to a and b. 
+ */ + public static double[] cross3d(double[] a, double[] b){ + if(a.length != 3){ + throw new IllegalArgumentException("Vector a length must equal 3."); + } + if(b.length != 3){ + throw new IllegalArgumentException("Vector b length must equal 3."); + } + return new double[]{ + a[1]*b[2] - a[2]*b[1], + a[2]*b[0] - a[0]*b[2], + a[0]*b[1] - a[1]*b[0] + }; + } + + /** + * Dot vector "a" against vector "b". + * That is, if a = [a1,a2,...,an] and b = [b1,b2,...,bn] + * then a dot b = a1*b1 + a2*b2 + ... + an*bn. + * @param a Vector a. + * @param b Vector b. + * @return The result of a dot b. + */ + public static double dot(double[] a, double[] b){ + if(a.length != b.length){ + throw new IllegalArgumentException("Vector 'a' must have the same length as vector 'b'."); + } + double dotProduct = 0; + for (int i = 0; i < a.length; i++) { + dotProduct += a[i]*b[i]; + } + return dotProduct; + } + + /** + * Multiply every component in "a" by a scalar. + * @param a The vector "a". + * @param scalar The scalar. + * @return A new vector. Original "a" is not modified. + */ + public static double[] scale(double[] a, double scalar){ + double[] aPrime = new double[a.length]; + for (int i = 0; i < a.length; i++) { + aPrime[i] = a[i] * scalar; + } + return aPrime; + } + + /** + * Add vectors a and b together in a component-wise fashion. + * @param a Vector a. + * @param b Vector b. + * @return Return vector a plus vector b in a new vector. + */ + public static double[] add(double[] a, double[] b){ + if(a.length != b.length){ + throw new IllegalArgumentException("Vector a and b must have the same lengths."); + } + double[] c = new double[a.length]; + for (int i = 0; i < a.length; i++) { + c[i] = a[i] + b[i]; + } + return c; + } + + /** + * Get the perpendicular distance between a point and a line formed by two other points. + * @param start The start point of the line. + * @param end The end point of the line. + * @param otherPt The other point to get the perpendicular distance from. 
+ * @return The perpendicular distance between the point and the line. + */ + public static double perpendicularDistance(double[] start, double[] end, double[] otherPt){ + if(start.length != end.length || start.length != otherPt.length){ + throw new IllegalArgumentException("Vectors must have equal lengths."); + } + double[] projectedPt = projectAlong(start, end, otherPt); + return Maths.dist(otherPt, projectedPt); + } + + /** + * Given a line formed by start and end points and some other point, + * project that other point onto the line. + * @param start The start point. + * @param end The end point. + * @param otherPt The other point. + * @return The point projected onto the line. + */ + public static double[] projectAlong(double[] start, double[] end, double[] otherPt){ + if(start.length != end.length || start.length != otherPt.length){ + throw new IllegalArgumentException("Vectors must have equal lengths."); + } + double[] ab = Maths.sub(otherPt,start); + double[] ac = Maths.sub(end,start); + double percentageAlong = Maths.dot(ab, ac) / Maths.dot(ac, ac); + double[] amountMovedAC = Maths.scale(ac, percentageAlong); + return Maths.add(start, amountMovedAC); + } + + private static double dotSq(double ax, double ay, double bx, double by, double cx, double cy) { + double dot = ax * by - ax * cy + bx * cy - bx * ay + cx * ay - cx * by; + return dot * dot; + } + + public static double median(double[] data){ + if(data.length == 0){ + return Double.NaN; + } + double[] d = new double[data.length]; + System.arraycopy(data, 0, d, 0, data.length); + Arrays.sort(d); + int len = d.length; + if(len == 1){ + return d[0]; + } + //even case + else if(len % 2 == 0){ + int midRightIdx = (d.length) / 2; + int midLeftIdx = midRightIdx - 1; + return (d[midRightIdx] + d[midLeftIdx]) / 2.0; + } + //odd case + else{ + int midIdx = (d.length - 1) / 2; + return d[midIdx]; + } + } + + public static long median(long[] data){ + if(data.length == 0){ + throw new IllegalArgumentException("Data 
must have at least one element to find median."); + } + long[] d = new long[data.length]; + System.arraycopy(data, 0, d, 0, data.length); + Arrays.sort(d); + int len = d.length; + if(len == 1){ + return d[0]; + } + //even case + else if(len % 2 == 0){ + int midRightIdx = (d.length) / 2; + int midLeftIdx = midRightIdx - 1; + return (long) ((d[midRightIdx] + d[midLeftIdx]) / 2.0); + } + //odd case + else{ + int midIdx = (d.length - 1) / 2; + return d[midIdx]; + } + + } + + public static double mode(double[] data){ + HashMap tally = new HashMap<>(); + + for (double v : data) { + int nOccurrences = tally.getOrDefault(v, 0) + 1; + tally.put(v, nOccurrences); + } + + Optional> modalOpt = + tally.entrySet().stream().max((o1, o2) -> Integer.compare(o1.getValue(), o2.getValue())); + + if(modalOpt.isPresent()){ + return modalOpt.get().getKey(); + } + + return Double.NaN; + } + + /** + * Smooth the data using a gaussian kernel. + * @param data The data to smooth. + * @param n The size of sliding window (i.e number of indices either side to sample). + * @return The smoothed version of the data. + */ + public static double[] gaussianSmooth(double[] data, int n){ + double[] smoothed = new double[data.length]; + + for (int i = 0; i < data.length; i++) { + int startIdx = Math.max(0, i - n); + int endIdx = Math.min(data.length - 1, i + n); + + double sumWeights = 0; + double sumIndexWeight = 0; + + for (int j = startIdx; j < endIdx + 1; j++) { + double indexScore = Math.abs(j - i)/(double)n; + double indexWeight = Maths.gaussian(indexScore, 1, 0, 1); + sumWeights += (indexWeight * data[j]); + sumIndexWeight += indexWeight; + } + smoothed[i] = sumWeights/sumIndexWeight; + } + return smoothed; + } + + /** + * Smooth the data using a gaussian kernel. + * @param data The data to smooth. + * @param w The size of sliding window (i.e number of indices either side to sample). + * @return The smoothed version of the data. 
/**
 * Smooth n-dimensional data using a gaussian kernel, dimension by dimension.
 * Each output point is the gaussian-weighted average of the points within
 * w indices either side of it (window clamped at the array ends).
 * @param data The data to smooth; every row must have the same dimensionality.
 * @param w The size of the sliding window (number of indices either side to
 *          sample). NOTE(review): with w == 0 the index weight is 0/0 (NaN).
 * @return The smoothed version of the data.
 * @throws IllegalArgumentException if data is empty or rows have zero dimensions
 */
public static double[][] gaussianSmooth2d(double[][] data, int w){
    final int dataSize = data.length;

    if(dataSize == 0){
        throw new IllegalArgumentException("Cannot smooth empty data.");
    }

    final int nDims = data[0].length;

    if(nDims == 0){
        throw new IllegalArgumentException("Cannot smooth a data point with no values. "
                + "Uniformly populate every entry in your data with 1 or more dimensions.");
    }

    double[][] smoothed = new double[dataSize][nDims];

    for (int i = 0; i < dataSize; i++) {
        // clamp the window to the array bounds
        int startIdx = Math.max(0, i - w);
        int endIdx = Math.min(dataSize - 1, i + w);

        // one weighted accumulator per dimension, one shared weight total
        double[] sumWeights = new double[nDims];
        double sumIndexWeight = 0;

        for (int j = startIdx; j < endIdx + 1; j++) {
            // distance from the centre index, normalised over the window
            double indexScore = Math.abs(j - i)/(double)w;
            double indexWeight = Maths.gaussian(indexScore, 1, 0, 1);

            for (int n = 0; n < nDims; n++) {
                sumWeights[n] += (indexWeight * data[j][n]);
            }
            sumIndexWeight += indexWeight;
        }

        // normalise each dimension by the total window weight
        for (int n = 0; n < nDims; n++) {
            smoothed[i][n] = sumWeights[n]/sumIndexWeight;
        }
    }
    return smoothed;
}

/**
 * Linear-interpolation "smoothing" of 2-column data using commons-math3.
 * Builds interpolation knots x from column 1 of data read back-to-front and
 * y from column 0 read front-to-front, then samples the resulting spline at
 * the knots themselves.
 *
 * NOTE(review): several details look suspicious and should be confirmed
 * against the caller's intent:
 *  - xi is allocated but never used;
 *  - x pairs a reversed column with a non-reversed one;
 *  - LinearInterpolator.interpolate requires strictly increasing x, so
 *    unsorted input will throw — TODO confirm inputs are pre-sorted;
 *  - sampling a linear spline exactly at its knots returns the knot y values.
 * @param data rows of [y, x] pairs (presumably; verify against caller)
 * @return a size-by-2 array with column 1 holding the reversed x knots and
 *         column 0 holding the spline values at those knots
 */
public static double[][] Smooth2d(double[][] data){
    // double linearInterp(double[] x, double[] y, double xi)

    int size = data.length; //50
    double x[] = new double[size];
    double xi[] = new double[size];   // NOTE(review): never used
    double y[] = new double[size];
    double smooth_xy[][] =new double[size][2];

    for ( int i=0 ; i<=size-1 ; i++) {
        // knots: column 1 reversed, column 0 in original order
        x[i] = data[(size-1)-i][1];
        y[i] = data[i][0];

    }

    // return linear interpolation of (x,y) on xi
    LinearInterpolator li = new LinearInterpolator();
    //

    PolynomialSplineFunction psf = li.interpolate(x,y);

    for ( int i=0 ; i<=size-1 ; i++) {

        smooth_xy[(size-1)-i][1]= x[i];

        smooth_xy[i][0]= psf.value(x[i]);

    }


    return smooth_xy;


}
+ * @return The new array containing the normalised data. + */ + public static double[] minmaxNormalise1d(double[] data){ + //find min and max value + double curMin = Double.POSITIVE_INFINITY; + double curMax = Double.NEGATIVE_INFINITY; + for (double v : data) { + if(v < curMin){ + curMin = v; + } + if(v > curMax){ + curMax = v; + } + } + + //normalise the data using min-max normalisation + //and also subtract each value from its normalised index + final double range = curMax - curMin; + double[] normalisedData = new double[data.length]; + + for (int i = 0; i < normalisedData.length; i++) { + normalisedData[i] = ((data[i] - curMin) / range); + } + return normalisedData; + } + + /** + * Performs min-max normalisation on n-dimensional data (as long as the dimensionality is uniform, that is, all data is 2d or all 3d etc.). + * @see Wikipedia article about feature re-scaling. + * @param data The data to normalised. + * @return A new normalised data-set. + */ + public static double[][] minmaxNormalise(double[][] data){ + + final int dataSize = data.length; + + if(dataSize == 0){ + throw new IllegalArgumentException("Cannot smooth empty data."); + } + + final int nDims = data[0].length; + + if(nDims == 0){ + throw new IllegalArgumentException("Cannot smooth a data point with no values. 
" + + "Uniformly populate every entry in your data with 1 or more dimensions."); + } + + //1) get min and max for each dimension of the data + + double[] minEachDim = new double[nDims]; + double[] maxEachDim = new double[nDims]; + for (int i = 0; i < nDims; i++) { + minEachDim[i] = Double.POSITIVE_INFINITY; + maxEachDim[i] = Double.NEGATIVE_INFINITY; + } + + for (double[] coords : data) { + for (int n = 0; n < nDims; n++) { + double v = coords[n]; + if (v < minEachDim[n]) { + minEachDim[n] = v; + } + if (v > maxEachDim[n]) { + maxEachDim[n] = v; + } + } + } + + //2) normalise the data using the min and max + double[] rangeEachDim = new double[nDims]; + for (int n = 0; n < nDims; n++) { + rangeEachDim[n] = maxEachDim[n] - minEachDim[n]; + } + + double[][] outputNormalised = new double[dataSize][nDims]; + for (int i = 0; i < dataSize; i++) { + for (int n = 0; n < nDims; n++) { + //normalising step + outputNormalised[i][n] = (data[i][n] - minEachDim[n]) / rangeEachDim[n]; + } + } + return outputNormalised; + } + +} \ No newline at end of file diff --git a/src/main/java/edu/uc/rphash/util/VectorUtil.java b/src/main/java/edu/uc/rphash/util/VectorUtil.java index 1d4ed98..a9f5c70 100644 --- a/src/main/java/edu/uc/rphash/util/VectorUtil.java +++ b/src/main/java/edu/uc/rphash/util/VectorUtil.java @@ -44,6 +44,17 @@ public static float distance(float[] x, float[] y) { dist += ((x[i] - y[i]) * (x[i] - y[i])); return (float) Math.sqrt(dist); } + + public static float distancesq(float[] x, float[] y) { + if (x.length < 1) + return Float.MAX_VALUE; + if (y.length < 1) + return Float.MAX_VALUE; + float dist = (x[0] - y[0]) * (x[0] - y[0]); + for (int i = 1; i < x.length; i++) + dist += ((x[i] - y[i]) * (x[i] - y[i])); + return (float)(dist); + } /** * Resturns the euclidean distance between a vector region {i-k} of x with a