From 4c794cdaea53811bcfe81dab59aa110ffedd7575 Mon Sep 17 00:00:00 2001 From: Lukas Layer Date: Fri, 18 Oct 2019 03:47:08 -0700 Subject: [PATCH 01/13] commit model --- examples/example_nlp.py | 109 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 examples/example_nlp.py diff --git a/examples/example_nlp.py b/examples/example_nlp.py new file mode 100644 index 0000000..529c38f --- /dev/null +++ b/examples/example_nlp.py @@ -0,0 +1,109 @@ +def make_count_model(**args): + + from keras.layers import Input, Flatten, Dense, Dropout, Reshape, multiply + from keras.regularizers import l2 + from keras.models import Model + if args:logging.debug("receiving arguments {}".format(args)) + + dense_layers = args.get('dense_layers', 3) + dense_units = args.get('dense_units', 50) + l2reg = args.get('l2reg', 0.001) + dropout = args.get('dropout', 0.001) + + n_codes = 77 + n_sites = 81 + n_counts = 2 + + m_input = Input((n_codes, n_sites, n_counts)) + + m = m_input + + m = Flatten()(m) + for _ in range( dense_layers ): + m = Dense( units = dense_units, activation='relu', + kernel_regularizer=l2(l2reg)) (m) + m = Dropout(dropout)(m) + + m_output = Dense( units=1, activation='sigmoid')(m) + + model = Model(inputs=m_input, outputs=m_output) + return model + + +def make_nlp_model(**args): + + from keras.layers import Embedding, Input, Dense, GRU, TimeDistributed, Dropout, Flatten, Reshape, Concatenate + from keras.regularizers import l2 + from keras.models import Model + + # Hyper parameter + rnn_units = args.get('rnn_units', 10) + embedding_dim = args.get('embedding_dim', 20) + l2_reg = args.get('l2_reg', 0.) + rec_do = args.get('rec_do', 0.) + dense_layers = args.get('dense_layers', 3) + dense_units = args.get('dense_units', 50) + site_units = args.get('site_units', 100) + do = args.get('do', 0.) 
+ + # Constants + n_codes = 77 + n_sites = 81 + n_counts = 2 + n_words = 30674 + encode_sites = False + + # Word encoder model + words_input = Input(shape = ( None, ), dtype='int32') + words_embedding = Embedding(n_words, embedding_dim, mask_zero = True)(words_input) + words_gru = GRU(rnn_units, kernel_regularizer=l2(l2_reg), recurrent_dropout = rec_do)(words_embedding) + wordEncoder = Model(words_input, words_gru) + + # Full model + sent_input = Input(shape = (n_codes * n_sites, None), dtype='int32') + count_input = Input(shape = (n_codes, n_sites, 2, ), dtype='float32') + sent_encoded = TimeDistributed(wordEncoder)(sent_input) + sent_encoded_reshaped = Reshape(( n_codes , n_sites, rnn_units))(sent_encoded) + concat_counts_sent = Concatenate(axis=3)([sent_encoded_reshaped, count_input]) + if encode_sites: + codes_reshaped = Reshape(( n_codes , n_sites * (rnn_units*n_counts)))(concat_counts_sent) + sites_encoded = TimeDistributed(Dense(site_units, activation = 'relu', kernel_regularizer=l2(l2_reg)))(codes_reshaped) + flat = Flatten()(sites_encoded) + else: + flat = Flatten()(concat_counts_sent) + dense = flat + for _ in range(dense_layers): + dense = Dense( dense_units, activation='relu', kernel_regularizer=l2(l2_reg) )(dense) + dense = Dropout(do)(dense) + preds = Dense(1, activation='sigmoid')(dense) + model = Model([sent_input, count_input], preds) + + return model + + + +get_model = make_nlp_model + +PATH_DATA = '/storage/user/llayer/NNLO' + +def get_name(): + return 'nlp' + +def get_train(): + + return [PATH_DATA + 'train_0.h5'] + +def get_val(): + + return [PATH_DATA + 'test_0.h5'] + +def get_features(): + #return ('features', lambda x: x) ##example of data adaptor + return 'features' + +def get_labels(): + return 'labels' + + + + From eff87f779416d675f531f39363cf509642df0293 Mon Sep 17 00:00:00 2001 From: Lukas Layer Date: Fri, 18 Oct 2019 03:54:19 -0700 Subject: [PATCH 02/13] add skopt dimensions --- examples/example_nlp.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/examples/example_nlp.py b/examples/example_nlp.py index 529c38f..95cf7c3 100644 --- a/examples/example_nlp.py +++ b/examples/example_nlp.py @@ -35,6 +35,7 @@ def make_nlp_model(**args): from keras.layers import Embedding, Input, Dense, GRU, TimeDistributed, Dropout, Flatten, Reshape, Concatenate from keras.regularizers import l2 from keras.models import Model + if args:logging.debug("receiving arguments {}".format(args)) # Hyper parameter rnn_units = args.get('rnn_units', 10) @@ -84,6 +85,18 @@ def make_nlp_model(**args): get_model = make_nlp_model +from skopt.space import Real, Integer, Categorical +get_model.parameter_range = [ + Real( low=1e-3, high=0.1, prior='log-uniform', name='do' ), + Real( low=1e-4, high=0.9, prior="log-uniform", name='l2_reg' ), + Integer( low=5, high=32, name='embedding_dim' ), + Integer( low=5, high=20, name='rnn_units' ), + #Integer( low=5, high = 20, name = 'site_units' ), + Integer( low=1, high=5, name='dense_layers' ), + Integer( low=10, high=100, name='dense_units' ), +] + + PATH_DATA = '/storage/user/llayer/NNLO' def get_name(): From 080cab9666491cc78c3c960a3f29c26b11276ba3 Mon Sep 17 00:00:00 2001 From: Lukas Layer Date: Fri, 18 Oct 2019 05:52:01 -0700 Subject: [PATCH 03/13] add first attempt of data frame class --- nnlo/train/data.py | 93 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/nnlo/train/data.py b/nnlo/train/data.py index 2d18cfd..e6f5b6a 100644 --- a/nnlo/train/data.py +++ b/nnlo/train/data.py @@ -1,6 +1,7 
@@ ### Data class and associated helper methods import numpy as np +import pandas as pd import h5py import os import time @@ -191,6 +192,98 @@ def load_data(self, in_file): Not implemented in base class; derived classes should implement this function""" raise NotImplementedError + + +class FrameData(Data): + + + def __init__(self, batch_size, + feature_adaptor, + cache=None, + copy_command=None, + preloading=0,, + frame_name='frame', + labels_name='label'): + + """Initializes and stores names of feature and label datasets""" + super(H5Data, self).__init__(batch_size,cache,copy_command) + self.feature_adaptor = feature_adaptor + self.frame_name = frame_name + self.labels_name = labels_name + ## initialize the data-preloader + self.fpl = None + if preloading: + self.fpl = FilePreloader( [] , file_open = lambda n : h5py.File(n,'r'), n_ahead=preloading) + self.fpl.start() + + + def load_frame(self, in_file_name): + + if self.fpl: + h5_file = self.fpl.getFile( in_file_name ) + else: + h5_file = h5py.File( in_file_name, 'r' ) + + frame = h5_file[self.frame_name] + + if self.fpl: + self.fpl.closeFile( in_file_name ) + else: + h5_file.close() + + return frame + + + def count_data(self): + + num_data = 0 + for in_file_name in self.file_names: + h5_file = h5py.File( in_file_name, 'r' ) + X = h5_file[frame_name] + num_data += len(X) + h5_file.close() + return num_data + + + def concat_data(self, data1, data2): + + return pd.concat([data1, data2]) + + + def generate_data(self): + + leftovers = None + for cur_file_name in self.file_names: + cur_frame = self.load_data(cur_file_name) + + # concatenate any leftover data from the previous file + if leftovers is not None: + cur_frame = self.concat_data( leftovers[0], cur_file_features ) + leftovers = None + + num_in_file = len(frame) + + for cur_pos in range(0, num_in_file, self.batch_size): + next_pos = cur_pos + self.batch_size + if next_pos <= num_in_file: + yield ( self.get_batch( cur_frame, cur_pos, next_pos ), cur_frame[self.label].iloc[cur_pos : next_pos].value ) + else: + leftovers = cur_frame.iloc[cur_pos, num_in_file] + + + def get_batch(self, cur_frame, start_pos, end_pos ): + + """ + Convert the batch of the dataframe to a numpy array + with the provided function + """ + + batch = cur_frame.iloc[start_pos : end_pos] + return self.feature_adaptor( batch ) + + + + class H5Data(Data): """Loads data stored in hdf5 files Attributes: From b915bbf8bc9e9ff3371ca161044c9cc1170ec1be Mon Sep 17 00:00:00 2001 From: Lukas Layer Date: Fri, 18 Oct 2019 05:55:53 -0700 Subject: [PATCH 04/13] update data frame class --- nnlo/train/data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nnlo/train/data.py b/nnlo/train/data.py index e6f5b6a..56603ab 100644 --- a/nnlo/train/data.py +++ b/nnlo/train/data.py @@ -217,7 +217,7 @@ def __init__(self, batch_size, self.fpl.start() - def load_frame(self, in_file_name): + def load_data(self, in_file_name): if self.fpl: h5_file = self.fpl.getFile( in_file_name ) @@ -258,7 +258,7 @@ def generate_data(self): # concatenate any leftover data from the previous file if leftovers is not None: - cur_frame = self.concat_data( leftovers[0], cur_file_features ) + cur_frame = self.concat_data( leftovers, cur_frame ) leftovers = None num_in_file = len(frame) From 9e33f72e3674fb9bd38c09642bbed0ae4af4c922 Mon Sep 17 00:00:00 2001 From: Lukas Layer Date: Fri, 18 Oct 2019 07:01:10 -0700 Subject: [PATCH 05/13] batch generator --- examples/example_nlp.py | 116 ++++++++++++++++++++++++++++++++++------ nnlo/train/data.py | 3 
+- 2 files changed, 101 insertions(+), 18 deletions(-) diff --git a/examples/example_nlp.py b/examples/example_nlp.py index 95cf7c3..a19ee25 100644 --- a/examples/example_nlp.py +++ b/examples/example_nlp.py @@ -1,3 +1,12 @@ +# Constants +PATH_DATA = '/storage/user/llayer/NNLO' +N_CODES = 77 +N_SITES = 81 +N_COUNTS = 2 +N_WORDS = 30674 +MAX_WORDS = 400 + + def make_count_model(**args): from keras.layers import Input, Flatten, Dense, Dropout, Reshape, multiply @@ -9,12 +18,9 @@ def make_count_model(**args): dense_units = args.get('dense_units', 50) l2reg = args.get('l2reg', 0.001) dropout = args.get('dropout', 0.001) + - n_codes = 77 - n_sites = 81 - n_counts = 2 - - m_input = Input((n_codes, n_sites, n_counts)) + m_input = Input((N_CODES, N_SITES, N_COUNTS)) m = m_input @@ -48,26 +54,22 @@ def make_nlp_model(**args): do = args.get('do', 0.) # Constants - n_codes = 77 - n_sites = 81 - n_counts = 2 - n_words = 30674 encode_sites = False # Word encoder model words_input = Input(shape = ( None, ), dtype='int32') - words_embedding = Embedding(n_words, embedding_dim, mask_zero = True)(words_input) + words_embedding = Embedding(N_WORDS, embedding_dim, mask_zero = True)(words_input) words_gru = GRU(rnn_units, kernel_regularizer=l2(l2_reg), recurrent_dropout = rec_do)(words_embedding) wordEncoder = Model(words_input, words_gru) # Full model - sent_input = Input(shape = (n_codes * n_sites, None), dtype='int32') - count_input = Input(shape = (n_codes, n_sites, 2, ), dtype='float32') + sent_input = Input(shape = (N_CODES * N_SITES, None), dtype='int32') + count_input = Input(shape = (N_CODES, N_SITES, 2, ), dtype='float32') sent_encoded = TimeDistributed(wordEncoder)(sent_input) - sent_encoded_reshaped = Reshape(( n_codes , n_sites, rnn_units))(sent_encoded) + sent_encoded_reshaped = Reshape(( N_CODES , N_SITES, rnn_units))(sent_encoded) concat_counts_sent = Concatenate(axis=3)([sent_encoded_reshaped, count_input]) if encode_sites: - codes_reshaped = Reshape(( n_codes , n_sites * (rnn_units*n_counts)))(concat_counts_sent) + codes_reshaped = Reshape(( N_CODES , N_SITES * (rnn_units*N_COUNTS)))(concat_counts_sent) sites_encoded = TimeDistributed(Dense(site_units, activation = 'relu', kernel_regularizer=l2(l2_reg)))(codes_reshaped) flat = Flatten()(sites_encoded) else: @@ -80,11 +82,93 @@ def make_nlp_model(**args): model = Model([sent_input, count_input], preds) return model - get_model = make_nlp_model + + + +import numpy as np + +# Dictionary to define the indexing for the codes and sites +codes_dict = #pickle.load ... TODO +sites_dict = #pickle.load ... 
TODO + +def to_dense(np_msg, np_counts, index, values): + + errors, sites, counts, site_states, error_messages = values + + # Loop over the codes and sites + for i_key in range(len(errors)): + + error = errors[i_key] + site = sites[i_key] + count = counts[i_key] + site_state = site_states[i_key] + + # Fill counts + if site_state == 'good': + site_state_encoded = 0 + else: + site_state_encoded = 1 + np_counts[index, codes_dict[error], sites_dict[site], site_state_encoded] += count + + # Fill the error messages + error_message = error_messages[i_key] + # Only continue if there exists a message + if isinstance(error_message, (list,)): + + # Cut/Pad the error message + error_message = np.array(array)(error_message) + pad_size = np_msg.shape[3] - error_message.shape[0] + if pad_size < 0: + error_message = error_message[-np_msg.shape[3] : ] + else: + npad = (0, pad_size) + error_message = np.pad(error_message, pad_width=npad, mode='constant', constant_values=int(0)) + + #print( error_message ) + np_msg[index, codes_dict[error], sites_dict[site]] = error_message + + +def batch_generator( batch ): + + + batch_size = len(batch) + tokens_key = 'msg_encoded' + + # Loop over the messages to find the longest one + padding_dim = 1 + for messages in batch[tokens_key]: + for msg in message: + if isinstance(msg, (list,)): + if len(msg) > padding_dim: + padding_dim = len(msg) + + # Limit to the maximum number of words + if padding_dim > MAX_WORDS: + padding_dim = MAX_WORDS + + # Setup the numpy matrix + np_msg = np.zeros(batch_size, N_CODES, N_SITES, padding_dim, dtype=np.int32) + np_counts = np.zeros((batch_size, N_CODES, N_SITES, N_COUNTS), dtype=np.int32) + + # Fill the matrix + [to_dense(np_msg, np_counts, counter, values) for counter, values in enumerate(zip(batch['error'], + batch['site'], + batch['count'], + batch['site_state'], + batch[tokens_key]))] + + # Reshape the error site matrix for the messages + np_msg = np_msg.reshape((batch_size, N_CODES * N_SITES, padding_dim)) + + # Return the matrix + return [np_msg, np_counts] + + + from skopt.space import Real, Integer, Categorical get_model.parameter_range = [ Real( low=1e-3, high=0.1, prior='log-uniform', name='do' ), @@ -97,8 +181,6 @@ def make_nlp_model(**args): ] -PATH_DATA = '/storage/user/llayer/NNLO' - def get_name(): return 'nlp' diff --git a/nnlo/train/data.py b/nnlo/train/data.py index 56603ab..0ba5e10 100644 --- a/nnlo/train/data.py +++ b/nnlo/train/data.py @@ -266,7 +266,8 @@ def generate_data(self): for cur_pos in range(0, num_in_file, self.batch_size): next_pos = cur_pos + self.batch_size if next_pos <= num_in_file: - yield ( self.get_batch( cur_frame, cur_pos, next_pos ), cur_frame[self.label].iloc[cur_pos : next_pos].value ) + yield ( self.get_batch( cur_frame, cur_pos, next_pos ), + cur_frame[self.labels_name].iloc[cur_pos : next_pos].value ) else: leftovers = cur_frame.iloc[cur_pos, num_in_file] From f28f2c3928cd5506c8042a92abe4930e757bade0 Mon Sep 17 00:00:00 2001 From: Lukas Layer Date: Fri, 18 Oct 2019 07:05:36 -0700 Subject: [PATCH 06/13] batch generator fix typo --- examples/example_nlp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/example_nlp.py b/examples/example_nlp.py index a19ee25..6005d68 100644 --- a/examples/example_nlp.py +++ b/examples/example_nlp.py @@ -141,7 +141,7 @@ def batch_generator( batch ): # Loop over the messages to find the longest one padding_dim = 1 for messages in batch[tokens_key]: - for msg in message: + for msg in messages: if isinstance(msg, (list,)): if len(msg) > 
padding_dim: padding_dim = len(msg) From 5f27636e7077cd57208a547472334089f4a279da Mon Sep 17 00:00:00 2001 From: Lukas Layer Date: Fri, 18 Oct 2019 08:01:14 -0700 Subject: [PATCH 07/13] fix small bugs --- nnlo/train/data.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/nnlo/train/data.py b/nnlo/train/data.py index 0ba5e10..61d0553 100644 --- a/nnlo/train/data.py +++ b/nnlo/train/data.py @@ -201,7 +201,7 @@ def __init__(self, batch_size, feature_adaptor, cache=None, copy_command=None, - preloading=0,, + preloading=0, frame_name='frame', labels_name='label'): @@ -239,7 +239,7 @@ def count_data(self): num_data = 0 for in_file_name in self.file_names: h5_file = h5py.File( in_file_name, 'r' ) - X = h5_file[frame_name] + X = h5_file[self.frame_name] num_data += len(X) h5_file.close() return num_data @@ -266,8 +266,8 @@ def generate_data(self): for cur_pos in range(0, num_in_file, self.batch_size): next_pos = cur_pos + self.batch_size if next_pos <= num_in_file: - yield ( self.get_batch( cur_frame, cur_pos, next_pos ), - cur_frame[self.labels_name].iloc[cur_pos : next_pos].value ) + yield ( self.get_batch( cur_frame, cur_pos, next_pos ), + cur_frame[self.labels_name].iloc[cur_pos : next_pos].value ) else: leftovers = cur_frame.iloc[cur_pos, num_in_file] @@ -283,6 +283,11 @@ def get_batch(self, cur_frame, start_pos, end_pos ): return self.feature_adaptor( batch ) + def finalize(self): + if self.fpl: + self.fpl.stop() + Data.finalize(self) + class H5Data(Data): From 43fcf5bc8564ceee4385e230b19998a674ee9152 Mon Sep 17 00:00:00 2001 From: Lukas Layer Date: Fri, 18 Oct 2019 10:10:10 -0700 Subject: [PATCH 08/13] fix several typos and minor bugs --- examples/example_nlp.py | 26 ++++++++++++++------------ nnlo/train/data.py | 10 ++++++---- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/examples/example_nlp.py b/examples/example_nlp.py index 6005d68..45aabbe 100644 --- a/examples/example_nlp.py +++ b/examples/example_nlp.py @@ -1,5 +1,5 @@ # Constants -PATH_DATA = '/storage/user/llayer/NNLO' +PATH_DATA = '/storage/user/llayer/NNLO/' N_CODES = 77 N_SITES = 81 N_COUNTS = 2 @@ -90,10 +90,10 @@ def make_nlp_model(**args): import numpy as np - -# Dictionary to define the indexing for the codes and sites -codes_dict = #pickle.load ... TODO -sites_dict = #pickle.load ... 
TODO +import pickle +with open('/storage/user/llayer/NNLO/index.pickle', 'rb') as handle: + sites_dict = pickle.load(handle) + codes_dict = pickle.load(handle) def to_dense(np_msg, np_counts, index, values): @@ -120,7 +120,7 @@ def to_dense(np_msg, np_counts, index, values): if isinstance(error_message, (list,)): # Cut/Pad the error message - error_message = np.array(array)(error_message) + error_message = np.array(error_message) pad_size = np_msg.shape[3] - error_message.shape[0] if pad_size < 0: error_message = error_message[-np_msg.shape[3] : ] @@ -151,7 +151,7 @@ def batch_generator( batch ): padding_dim = MAX_WORDS # Setup the numpy matrix - np_msg = np.zeros(batch_size, N_CODES, N_SITES, padding_dim, dtype=np.int32) + np_msg = np.zeros((batch_size, N_CODES, N_SITES, padding_dim), dtype=np.int32) np_counts = np.zeros((batch_size, N_CODES, N_SITES, N_COUNTS), dtype=np.int32) # Fill the matrix @@ -181,23 +181,25 @@ def batch_generator( batch ): ] + + + def get_name(): return 'nlp' def get_train(): - return [PATH_DATA + 'train_0.h5'] + return [PATH_DATA + 'train_0.h5', PATH_DATA + 'train_1.h5', PATH_DATA + 'train_2.h5'] def get_val(): - return [PATH_DATA + 'test_0.h5'] + return [PATH_DATA + 'test_0.h5', PATH_DATA + 'test_1.h5', PATH_DATA + 'test_2.h5'] def get_features(): - #return ('features', lambda x: x) ##example of data adaptor - return 'features' + return ('frame', batch_generator) ##example of data adaptor def get_labels(): - return 'labels' + return 'label' diff --git a/nnlo/train/data.py b/nnlo/train/data.py index 61d0553..c9c8c3a 100644 --- a/nnlo/train/data.py +++ b/nnlo/train/data.py @@ -206,7 +206,7 @@ def __init__(self, batch_size, labels_name='label'): """Initializes and stores names of feature and label datasets""" - super(H5Data, self).__init__(batch_size,cache,copy_command) + super(FrameData, self).__init__(batch_size,cache,copy_command) self.feature_adaptor = feature_adaptor self.frame_name = frame_name self.labels_name = labels_name @@ -219,6 +219,7 @@ def __init__(self, batch_size, def load_data(self, in_file_name): + """ if self.fpl: h5_file = self.fpl.getFile( in_file_name ) else: @@ -230,7 +231,8 @@ def load_data(self, in_file_name): self.fpl.closeFile( in_file_name ) else: h5_file.close() - + """ + frame = pd.read_hdf(in_file_name, 'frame') return frame @@ -261,13 +263,13 @@ def generate_data(self): cur_frame = self.concat_data( leftovers, cur_frame ) leftovers = None - num_in_file = len(frame) + num_in_file = len(cur_frame) for cur_pos in range(0, num_in_file, self.batch_size): next_pos = cur_pos + self.batch_size if next_pos <= num_in_file: yield ( self.get_batch( cur_frame, cur_pos, next_pos ), - cur_frame[self.labels_name].iloc[cur_pos : next_pos].value ) + cur_frame[self.labels_name].iloc[cur_pos : next_pos].values) else: leftovers = cur_frame.iloc[cur_pos, num_in_file] From 5d6e5016304c5e3d3b38dcb5dae7a47c59dee6af Mon Sep 17 00:00:00 2001 From: Lukas Layer Date: Fri, 18 Oct 2019 11:30:48 -0700 Subject: [PATCH 09/13] add fast check that the dense vector is correct --- examples/example_nlp.py | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/examples/example_nlp.py b/examples/example_nlp.py index 45aabbe..5b1b09e 100644 --- a/examples/example_nlp.py +++ b/examples/example_nlp.py @@ -202,5 +202,40 @@ def get_labels(): return 'label' - +if __name__ == "__main__": + + import pandas as pd + # Open a frame + path = PATH_DATA + 'train_0.h5' + frame = pd.read_hdf(path, 'frame') + print( frame.head() ) + + # Get a 
batch + start = 0 + batch_size = 2 + batch = frame.iloc[start: start+batch_size] + matrix = batch_generator( batch ) + print( matrix[0].shape, matrix[1].shape ) + matrix_msg = matrix[0].reshape((batch_size, N_CODES, N_SITES, matrix[0].shape[2])) + + # Fast check that the matrix is filled correctly + def print_sample( batch, index ): + + sample = batch.iloc[index] + errors = sample['error'] + sites = sample['site'] + message = sample['msg_encoded'] + print( errors ) + print( sites ) + print( message ) + + for i_key in range(len(errors)): + + print( 'Index error', errors[i_key], ':', codes_dict[errors[i_key]], + 'Index site', sites[i_key], ':', sites_dict[sites[i_key]] ) + print( 'Inserted in matrix' ) + print( matrix_msg[index, codes_dict[errors[i_key]], sites_dict[sites[i_key]]] ) + + print_sample( batch, 1 ) + From c07820c2350bd8a670eb419a4d6824d281fef48d Mon Sep 17 00:00:00 2001 From: Lukas Layer Date: Sat, 19 Oct 2019 05:12:40 -0700 Subject: [PATCH 10/13] first run --- TrainingDriver.py | 30 ++++++++++++++++++++++++++---- examples/example_nlp.py | 5 ++++- nnlo/train/data.py | 12 +++++------- 3 files changed, 35 insertions(+), 12 deletions(-) diff --git a/TrainingDriver.py b/TrainingDriver.py index cc8a654..7cc5ffb 100755 --- a/TrainingDriver.py +++ b/TrainingDriver.py @@ -15,7 +15,7 @@ from nnlo.mpi.manager import MPIManager, get_device from nnlo.train.algo import Algo -from nnlo.train.data import H5Data +from nnlo.train.data import H5Data, FrameData from nnlo.train.model import ModelFromJson, ModelTensorFlow, ModelPytorch from nnlo.util.utils import import_keras from nnlo.util.timeline import Timeline @@ -136,6 +136,8 @@ def add_train_options(parser): add_checkpoint_options(parser) def make_loader( args, features_name, labels_name, train_list): + + """ data = H5Data( batch_size=args.batch, cache = args.caching_dir, copy_command = args.copy_command, @@ -143,6 +145,17 @@ def make_loader( args, features_name, labels_name, train_list): features_name=features_name, labels_name=labels_name, ) + """ + + data = FrameData(batch_size=args.batch, + feature_adaptor = features_name[1], + cache = args.caching_dir, + copy_command = args.copy_command, + preloading = None, #args.data_preload, + frame_name=features_name[0], + labels_name=labels_name, + ) + # We initialize the Data object with the training data list # so that we can use it to count the number of training examples data.set_full_file_names( train_list ) @@ -230,13 +243,16 @@ def make_features_labels(m_module, args): args = parser.parse_args() initialize_logger(filename=args.log_file, file_level=args.log_level, stream_level=args.log_level) + + + a_backend = args.backend if 'torch' in args.model: a_backend = 'torch' m_module = __import__(args.model.replace('.py','').replace('/', '.'), fromlist=[None]) if '.py' in args.model else None (features_name, labels_name) = make_features_labels(m_module, args) - (train_list, val_list) = make_train_val_lists(m_module, args) + (train_list, val_list) = make_train_val_lists(m_module, args) comm = MPI.COMM_WORLD.Dup() if args.timeline: Timeline.enable() @@ -287,9 +303,15 @@ def make_features_labels(m_module, args): model_builder = ModelTensorFlow( comm, source=args.model, weights=model_weights) - + data = make_loader(args, features_name, labels_name, train_list) - + + #print( data ) + #print( train_list ) + #print( stop ) + print( 'DATA', data.count_data() ) + #print( stop ) + # Some input arguments may be ignored depending on chosen algorithm algo = make_algo( args, use_tf, comm, 
validate_every=int(data.count_data()/args.batch )) diff --git a/examples/example_nlp.py b/examples/example_nlp.py index 5b1b09e..3bc3af4 100644 --- a/examples/example_nlp.py +++ b/examples/example_nlp.py @@ -1,5 +1,5 @@ # Constants -PATH_DATA = '/storage/user/llayer/NNLO/' +PATH_DATA = '/storage/group/gpu/bigdata/CMSOpPred/' N_CODES = 77 N_SITES = 81 N_COUNTS = 2 @@ -204,6 +204,9 @@ def get_labels(): if __name__ == "__main__": + model = get_model() + model.summary() + import pandas as pd # Open a frame path = PATH_DATA + 'train_0.h5' diff --git a/nnlo/train/data.py b/nnlo/train/data.py index c9c8c3a..1a51d3e 100644 --- a/nnlo/train/data.py +++ b/nnlo/train/data.py @@ -232,7 +232,7 @@ def load_data(self, in_file_name): else: h5_file.close() """ - frame = pd.read_hdf(in_file_name, 'frame') + frame = pd.read_hdf(in_file_name, self.frame_name) return frame @@ -240,10 +240,8 @@ def count_data(self): num_data = 0 for in_file_name in self.file_names: - h5_file = h5py.File( in_file_name, 'r' ) - X = h5_file[self.frame_name] - num_data += len(X) - h5_file.close() + frame = pd.read_hdf(in_file_name, self.frame_name) + num_data += len(frame) return num_data @@ -271,7 +269,7 @@ def generate_data(self): yield ( self.get_batch( cur_frame, cur_pos, next_pos ), cur_frame[self.labels_name].iloc[cur_pos : next_pos].values) else: - leftovers = cur_frame.iloc[cur_pos, num_in_file] + leftovers = cur_frame.iloc[cur_pos : num_in_file] def get_batch(self, cur_frame, start_pos, end_pos ): @@ -280,7 +278,7 @@ def get_batch(self, cur_frame, start_pos, end_pos ): Convert the batch of the dataframe to a numpy array with the provided function """ - + #print( 'Gen batch' ) batch = cur_frame.iloc[start_pos : end_pos] return self.feature_adaptor( batch ) From 015286a3a9f23fc1b7c3dc980d01dbf0fa2d9072 Mon Sep 17 00:00:00 2001 From: Lukas Layer Date: Tue, 22 Oct 2019 08:21:12 -0700 Subject: [PATCH 11/13] Add parser option for the dataframe loader --- TrainingDriver.py | 44 +++++++++++++++++++---------------------- examples/example_nlp.py | 1 - nnlo/train/data.py | 35 +++++--------------------------- 3 files changed, 25 insertions(+), 55 deletions(-) diff --git a/TrainingDriver.py b/TrainingDriver.py index 7cc5ffb..3540486 100755 --- a/TrainingDriver.py +++ b/TrainingDriver.py @@ -57,6 +57,7 @@ def add_downpour_options(parser): def add_loader_options(parser): + parser.add_argument('--data-loader',help='Data loader to load the input files',default='h5py', dest='data_loader') parser.add_argument('--preload-data', help='Preload files as we read them', default=0, type=int, dest='data_preload') parser.add_argument('--cache-data', help='Cache the input files to a provided directory', default='', dest='caching_dir') parser.add_argument('--copy-command', help='Specific command line to copy the data into the cache. Expect a string with two {} first is the source (from input file list), second is the bare file name at destination. 
Like "cp {} {}"', default=None, dest='copy_command') @@ -137,24 +138,25 @@ def add_train_options(parser): def make_loader( args, features_name, labels_name, train_list): - """ - data = H5Data( batch_size=args.batch, - cache = args.caching_dir, - copy_command = args.copy_command, - preloading = args.data_preload, - features_name=features_name, - labels_name=labels_name, - ) - """ - - data = FrameData(batch_size=args.batch, - feature_adaptor = features_name[1], - cache = args.caching_dir, - copy_command = args.copy_command, - preloading = None, #args.data_preload, - frame_name=features_name[0], - labels_name=labels_name, - ) + if 'dataframe' in args.data_loader: + + data = FrameData(batch_size=args.batch, + feature_adaptor = features_name[1], + cache = args.caching_dir, + copy_command = args.copy_command, + preloading = None, #args.data_preload, + frame_name=features_name[0], + labels_name=labels_name, + ) + else: + + data = H5Data( batch_size=args.batch, + cache = args.caching_dir, + copy_command = args.copy_command, + preloading = args.data_preload, + features_name=features_name, + labels_name=labels_name, + ) # We initialize the Data object with the training data list # so that we can use it to count the number of training examples @@ -306,12 +308,6 @@ def make_features_labels(m_module, args): data = make_loader(args, features_name, labels_name, train_list) - #print( data ) - #print( train_list ) - #print( stop ) - print( 'DATA', data.count_data() ) - #print( stop ) - # Some input arguments may be ignored depending on chosen algorithm algo = make_algo( args, use_tf, comm, validate_every=int(data.count_data()/args.batch )) diff --git a/examples/example_nlp.py b/examples/example_nlp.py index 3bc3af4..fa150b7 100644 --- a/examples/example_nlp.py +++ b/examples/example_nlp.py @@ -134,7 +134,6 @@ def to_dense(np_msg, np_counts, index, values): def batch_generator( batch ): - batch_size = len(batch) tokens_key = 'msg_encoded' diff --git a/nnlo/train/data.py b/nnlo/train/data.py index 1a51d3e..bdd3b9b 100644 --- a/nnlo/train/data.py +++ b/nnlo/train/data.py @@ -195,8 +195,7 @@ def load_data(self, in_file): class FrameData(Data): - - + """ Load pandas frame stored in hdf5 files """ def __init__(self, batch_size, feature_adaptor, cache=None, @@ -204,40 +203,19 @@ def __init__(self, batch_size, preloading=0, frame_name='frame', labels_name='label'): - """Initializes and stores names of feature and label datasets""" super(FrameData, self).__init__(batch_size,cache,copy_command) self.feature_adaptor = feature_adaptor self.frame_name = frame_name self.labels_name = labels_name ## initialize the data-preloader - self.fpl = None - if preloading: - self.fpl = FilePreloader( [] , file_open = lambda n : h5py.File(n,'r'), n_ahead=preloading) - self.fpl.start() - + self.fpl = None def load_data(self, in_file_name): - - """ - if self.fpl: - h5_file = self.fpl.getFile( in_file_name ) - else: - h5_file = h5py.File( in_file_name, 'r' ) - - frame = h5_file[self.frame_name] - - if self.fpl: - self.fpl.closeFile( in_file_name ) - else: - h5_file.close() - """ frame = pd.read_hdf(in_file_name, self.frame_name) return frame - def count_data(self): - num_data = 0 for in_file_name in self.file_names: frame = pd.read_hdf(in_file_name, self.frame_name) @@ -246,23 +224,21 @@ def count_data(self): def concat_data(self, data1, data2): - return pd.concat([data1, data2]) def generate_data(self): - + """ + Overwrite the the parent generate_data and adapt to pandas frames + """ leftovers = None for cur_file_name in 
self.file_names: cur_frame = self.load_data(cur_file_name) - # concatenate any leftover data from the previous file if leftovers is not None: cur_frame = self.concat_data( leftovers, cur_frame ) leftovers = None - num_in_file = len(cur_frame) - for cur_pos in range(0, num_in_file, self.batch_size): next_pos = cur_pos + self.batch_size if next_pos <= num_in_file: @@ -273,7 +249,6 @@ def generate_data(self): def get_batch(self, cur_frame, start_pos, end_pos ): - """ Convert the batch of the dataframe to a numpy array with the provided function From 7cd435f9adee52d491171996d3d0d681cf4c82f0 Mon Sep 17 00:00:00 2001 From: Lukas Layer Date: Tue, 22 Oct 2019 08:23:01 -0700 Subject: [PATCH 12/13] fix spaces --- nnlo/train/data.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/nnlo/train/data.py b/nnlo/train/data.py index bdd3b9b..1f8498f 100644 --- a/nnlo/train/data.py +++ b/nnlo/train/data.py @@ -222,11 +222,9 @@ def count_data(self): num_data += len(frame) return num_data - def concat_data(self, data1, data2): return pd.concat([data1, data2]) - def generate_data(self): """ Overwrite the the parent generate_data and adapt to pandas frames @@ -247,7 +245,6 @@ def generate_data(self): else: leftovers = cur_frame.iloc[cur_pos : num_in_file] - def get_batch(self, cur_frame, start_pos, end_pos ): """ Convert the batch of the dataframe to a numpy array @@ -257,7 +254,6 @@ def get_batch(self, cur_frame, start_pos, end_pos ): batch = cur_frame.iloc[start_pos : end_pos] return self.feature_adaptor( batch ) - def finalize(self): if self.fpl: self.fpl.stop() From 57b94a525c8e436fbaf1dea2fd27028fa1bbfb7c Mon Sep 17 00:00:00 2001 From: Lukas Layer Date: Tue, 22 Oct 2019 20:18:45 +0200 Subject: [PATCH 13/13] Fix typo --- examples/example_nlp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/example_nlp.py b/examples/example_nlp.py index fa150b7..b5e9b07 100644 --- a/examples/example_nlp.py +++ b/examples/example_nlp.py @@ -69,7 +69,7 @@ def make_nlp_model(**args): sent_encoded_reshaped = Reshape(( N_CODES , N_SITES, rnn_units))(sent_encoded) concat_counts_sent = Concatenate(axis=3)([sent_encoded_reshaped, count_input]) if encode_sites: - codes_reshaped = Reshape(( N_CODES , N_SITES * (rnn_units*N_COUNTS)))(concat_counts_sent) + codes_reshaped = Reshape(( N_CODES , N_SITES * (rnn_units + N_COUNTS)))(concat_counts_sent) sites_encoded = TimeDistributed(Dense(site_units, activation = 'relu', kernel_regularizer=l2(l2_reg)))(codes_reshaped) flat = Flatten()(sites_encoded) else:
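
As a quick end-to-end illustration of how the pieces introduced above fit together, here is a minimal sketch (not part of the patch series) that drives the new FrameData loader with the adaptor returned by get_features(), mirroring what make_loader() does in TrainingDriver.py when the dataframe loader is selected. It assumes examples.example_nlp is importable from the working directory and that the hard-coded pickle and HDF5 paths exist; the batch size of 32 is an arbitrary choice for the sketch.

# Hedged sketch: wire FrameData (nnlo/train/data.py) to the batch_generator adaptor
# from examples/example_nlp.py. Assumptions: examples.example_nlp is importable
# (importing it reads the index pickle) and the HDF5 files listed by get_train() exist.
import examples.example_nlp as example_nlp
from nnlo.train.data import FrameData

frame_name, adaptor = example_nlp.get_features()       # ('frame', batch_generator)

data = FrameData(batch_size=32,                        # arbitrary batch size for this sketch
                 feature_adaptor=adaptor,              # turns a frame slice into [np_msg, np_counts]
                 frame_name=frame_name,
                 labels_name=example_nlp.get_labels())  # 'label'
data.set_full_file_names(example_nlp.get_train())
print('training examples:', data.count_data())

# Each yielded batch is ([np_msg, np_counts], labels), matching the two inputs of make_nlp_model.
for features, labels in data.generate_data():
    print(features[0].shape, features[1].shape, labels.shape)
    break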