From 4c794cdaea53811bcfe81dab59aa110ffedd7575 Mon Sep 17 00:00:00 2001 From: Lukas Layer Date: Fri, 18 Oct 2019 03:47:08 -0700 Subject: [PATCH 01/13] commit model --- examples/example_nlp.py | 109 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 examples/example_nlp.py diff --git a/examples/example_nlp.py b/examples/example_nlp.py new file mode 100644 index 0000000..529c38f --- /dev/null +++ b/examples/example_nlp.py @@ -0,0 +1,109 @@ +def make_count_model(**args): + + from keras.layers import Input, Flatten, Dense, Dropout, Reshape, multiply + from keras.regularizers import l2 + from keras.models import Model + if args:logging.debug("receiving arguments {}".format(args)) + + dense_layers = args.get('dense_layers', 3) + dense_units = args.get('dense_units', 50) + l2reg = args.get('l2reg', 0.001) + dropout = args.get('dropout', 0.001) + + n_codes = 77 + n_sites = 81 + n_counts = 2 + + m_input = Input((n_codes, n_sites, n_counts)) + + m = m_input + + m = Flatten()(m) + for _ in range( dense_layers ): + m = Dense( units = dense_units, activation='relu', + kernel_regularizer=l2(l2reg)) (m) + m = Dropout(dropout)(m) + + m_output = Dense( units=1, activation='sigmoid')(m) + + model = Model(inputs=m_input, outputs=m_output) + return model + + +def make_nlp_model(**args): + + from keras.layers import Embedding, Input, Dense, GRU, TimeDistributed, Dropout, Flatten, Reshape, Concatenate + from keras.regularizers import l2 + from keras.models import Model + + # Hyper parameter + rnn_units = args.get('rnn_units', 10) + embedding_dim = args.get('embedding_dim', 20) + l2_reg = args.get('l2_reg', 0.) + rec_do = args.get('rec_do', 0.) + dense_layers = args.get('dense_layers', 3) + dense_units = args.get('dense_units', 50) + site_units = args.get('site_units', 100) + do = args.get('do', 0.) 
+ + # Constants + n_codes = 77 + n_sites = 81 + n_counts = 2 + n_words = 30674 + encode_sites = False + + # Word encoder model + words_input = Input(shape = ( None, ), dtype='int32') + words_embedding = Embedding(n_words, embedding_dim, mask_zero = True)(words_input) + words_gru = GRU(rnn_units, kernel_regularizer=l2(l2_reg), recurrent_dropout = rec_do)(words_embedding) + wordEncoder = Model(words_input, words_gru) + + # Full model + sent_input = Input(shape = (n_codes * n_sites, None), dtype='int32') + count_input = Input(shape = (n_codes, n_sites, 2, ), dtype='float32') + sent_encoded = TimeDistributed(wordEncoder)(sent_input) + sent_encoded_reshaped = Reshape(( n_codes , n_sites, rnn_units))(sent_encoded) + concat_counts_sent = Concatenate(axis=3)([sent_encoded_reshaped, count_input]) + if encode_sites: + codes_reshaped = Reshape(( n_codes , n_sites * (rnn_units*n_counts)))(concat_counts_sent) + sites_encoded = TimeDistributed(Dense(site_units, activation = 'relu', kernel_regularizer=l2(l2_reg)))(codes_reshaped) + flat = Flatten()(sites_encoded) + else: + flat = Flatten()(concat_counts_sent) + dense = flat + for _ in range(dense_layers): + dense = Dense( dense_units, activation='relu', kernel_regularizer=l2(l2_reg) )(dense) + dense = Dropout(do)(dense) + preds = Dense(1, activation='sigmoid')(dense) + model = Model([sent_input, count_input], preds) + + return model + + + +get_model = make_nlp_model + +PATH_DATA = '/storage/user/llayer/NNLO' + +def get_name(): + return 'nlp' + +def get_train(): + + return [PATH_DATA + 'train_0.h5'] + +def get_val(): + + return [PATH_DATA + 'test_0.h5'] + +def get_features(): + #return ('features', lambda x: x) ##example of data adaptor + return 'features' + +def get_labels(): + return 'labels' + + + + From eff87f779416d675f531f39363cf509642df0293 Mon Sep 17 00:00:00 2001 From: Lukas Layer Date: Fri, 18 Oct 2019 03:54:19 -0700 Subject: [PATCH 02/13] add skopt dimensions --- examples/example_nlp.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/examples/example_nlp.py b/examples/example_nlp.py index 529c38f..95cf7c3 100644 --- a/examples/example_nlp.py +++ b/examples/example_nlp.py @@ -35,6 +35,7 @@ def make_nlp_model(**args): from keras.layers import Embedding, Input, Dense, GRU, TimeDistributed, Dropout, Flatten, Reshape, Concatenate from keras.regularizers import l2 from keras.models import Model + if args:logging.debug("receiving arguments {}".format(args)) # Hyper parameter rnn_units = args.get('rnn_units', 10) @@ -84,6 +85,18 @@ def make_nlp_model(**args): get_model = make_nlp_model +from skopt.space import Real, Integer, Categorical +get_model.parameter_range = [ + Real( low=1e-3, high=0.1, prior='log-uniform', name='do' ), + Real( low=1e-4, high=0.9, prior="log-uniform", name='l2_reg' ), + Integer( low=5, high=32, name='embedding_dim' ), + Integer( low=5, high=20, name='rnn_units' ), + #Integer( low=5, high = 20, name = 'site_units' ), + Integer( low=1, high=5, name='dense_layers' ), + Integer( low=10, high=100, name='dense_units' ), +] + + PATH_DATA = '/storage/user/llayer/NNLO' def get_name(): From 080cab9666491cc78c3c960a3f29c26b11276ba3 Mon Sep 17 00:00:00 2001 From: Lukas Layer Date: Fri, 18 Oct 2019 05:52:01 -0700 Subject: [PATCH 03/13] add first attempt of data frame class --- nnlo/train/data.py | 93 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/nnlo/train/data.py b/nnlo/train/data.py index 2d18cfd..e6f5b6a 100644 --- a/nnlo/train/data.py +++ b/nnlo/train/data.py @@ -1,6 +1,7 
@@ ### Data class and associated helper methods import numpy as np +import pandas as pd import h5py import os import time @@ -191,6 +192,98 @@ def load_data(self, in_file): Not implemented in base class; derived classes should implement this function""" raise NotImplementedError + + +class FrameData(Data): + + + def __init__(self, batch_size, + feature_adaptor, + cache=None, + copy_command=None, + preloading=0,, + frame_name='frame', + labels_name='label'): + + """Initializes and stores names of feature and label datasets""" + super(H5Data, self).__init__(batch_size,cache,copy_command) + self.feature_adaptor = feature_adaptor + self.frame_name = frame_name + self.labels_name = labels_name + ## initialize the data-preloader + self.fpl = None + if preloading: + self.fpl = FilePreloader( [] , file_open = lambda n : h5py.File(n,'r'), n_ahead=preloading) + self.fpl.start() + + + def load_frame(self, in_file_name): + + if self.fpl: + h5_file = self.fpl.getFile( in_file_name ) + else: + h5_file = h5py.File( in_file_name, 'r' ) + + frame = h5_file[self.frame_name] + + if self.fpl: + self.fpl.closeFile( in_file_name ) + else: + h5_file.close() + + return frame + + + def count_data(self): + + num_data = 0 + for in_file_name in self.file_names: + h5_file = h5py.File( in_file_name, 'r' ) + X = h5_file[frame_name] + num_data += len(X) + h5_file.close() + return num_data + + + def concat_data(self, data1, data2): + + return pd.concat([data1, data2]) + + + def generate_data(self): + + leftovers = None + for cur_file_name in self.file_names: + cur_frame = self.load_data(cur_file_name) + + # concatenate any leftover data from the previous file + if leftovers is not None: + cur_frame = self.concat_data( leftovers[0], cur_file_features ) + leftovers = None + + num_in_file = len(frame) + + for cur_pos in range(0, num_in_file, self.batch_size): + next_pos = cur_pos + self.batch_size + if next_pos <= num_in_file: + yield ( self.get_batch( cur_frame, cur_pos, next_pos ), cur_frame[self.label].iloc[cur_pos : next_pos].value ) + else: + leftovers = cur_frame.iloc[cur_pos, num_in_file] + + + def get_batch(self, cur_frame, start_pos, end_pos ): + + """ + Convert the batch of the dataframe to a numpy array + with the provided function + """ + + batch = cur_frame.iloc[start_pos : end_pos] + return self.feature_adaptor( batch ) + + + + class H5Data(Data): """Loads data stored in hdf5 files Attributes: From b915bbf8bc9e9ff3371ca161044c9cc1170ec1be Mon Sep 17 00:00:00 2001 From: Lukas Layer Date: Fri, 18 Oct 2019 05:55:53 -0700 Subject: [PATCH 04/13] update data frame class --- nnlo/train/data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nnlo/train/data.py b/nnlo/train/data.py index e6f5b6a..56603ab 100644 --- a/nnlo/train/data.py +++ b/nnlo/train/data.py @@ -217,7 +217,7 @@ def __init__(self, batch_size, self.fpl.start() - def load_frame(self, in_file_name): + def load_data(self, in_file_name): if self.fpl: h5_file = self.fpl.getFile( in_file_name ) @@ -258,7 +258,7 @@ def generate_data(self): # concatenate any leftover data from the previous file if leftovers is not None: - cur_frame = self.concat_data( leftovers[0], cur_file_features ) + cur_frame = self.concat_data( leftovers, cur_frame ) leftovers = None num_in_file = len(frame) From 9e33f72e3674fb9bd38c09642bbed0ae4af4c922 Mon Sep 17 00:00:00 2001 From: Lukas Layer Date: Fri, 18 Oct 2019 07:01:10 -0700 Subject: [PATCH 05/13] batch generator --- examples/example_nlp.py | 116 ++++++++++++++++++++++++++++++++++------ nnlo/train/data.py | 3 
+- 2 files changed, 101 insertions(+), 18 deletions(-) diff --git a/examples/example_nlp.py b/examples/example_nlp.py index 95cf7c3..a19ee25 100644 --- a/examples/example_nlp.py +++ b/examples/example_nlp.py @@ -1,3 +1,12 @@ +# Constants +PATH_DATA = '/storage/user/llayer/NNLO' +N_CODES = 77 +N_SITES = 81 +N_COUNTS = 2 +N_WORDS = 30674 +MAX_WORDS = 400 + + def make_count_model(**args): from keras.layers import Input, Flatten, Dense, Dropout, Reshape, multiply @@ -9,12 +18,9 @@ def make_count_model(**args): dense_units = args.get('dense_units', 50) l2reg = args.get('l2reg', 0.001) dropout = args.get('dropout', 0.001) + - n_codes = 77 - n_sites = 81 - n_counts = 2 - - m_input = Input((n_codes, n_sites, n_counts)) + m_input = Input((N_CODES, N_SITES, N_COUNTS)) m = m_input @@ -48,26 +54,22 @@ def make_nlp_model(**args): do = args.get('do', 0.) # Constants - n_codes = 77 - n_sites = 81 - n_counts = 2 - n_words = 30674 encode_sites = False # Word encoder model words_input = Input(shape = ( None, ), dtype='int32') - words_embedding = Embedding(n_words, embedding_dim, mask_zero = True)(words_input) + words_embedding = Embedding(N_WORDS, embedding_dim, mask_zero = True)(words_input) words_gru = GRU(rnn_units, kernel_regularizer=l2(l2_reg), recurrent_dropout = rec_do)(words_embedding) wordEncoder = Model(words_input, words_gru) # Full model - sent_input = Input(shape = (n_codes * n_sites, None), dtype='int32') - count_input = Input(shape = (n_codes, n_sites, 2, ), dtype='float32') + sent_input = Input(shape = (N_CODES * N_SITES, None), dtype='int32') + count_input = Input(shape = (N_CODES, N_SITES, 2, ), dtype='float32') sent_encoded = TimeDistributed(wordEncoder)(sent_input) - sent_encoded_reshaped = Reshape(( n_codes , n_sites, rnn_units))(sent_encoded) + sent_encoded_reshaped = Reshape(( N_CODES , N_SITES, rnn_units))(sent_encoded) concat_counts_sent = Concatenate(axis=3)([sent_encoded_reshaped, count_input]) if encode_sites: - codes_reshaped = Reshape(( n_codes , n_sites * (rnn_units*n_counts)))(concat_counts_sent) + codes_reshaped = Reshape(( N_CODES , N_SITES * (rnn_units*N_COUNTS)))(concat_counts_sent) sites_encoded = TimeDistributed(Dense(site_units, activation = 'relu', kernel_regularizer=l2(l2_reg)))(codes_reshaped) flat = Flatten()(sites_encoded) else: @@ -80,11 +82,93 @@ def make_nlp_model(**args): model = Model([sent_input, count_input], preds) return model - get_model = make_nlp_model + + + +import numpy as np + +# Dictionary to define the indexing for the codes and sites +codes_dict = #pickle.load ... TODO +sites_dict = #pickle.load ... 
TODO + +def to_dense(np_msg, np_counts, index, values): + + errors, sites, counts, site_states, error_messages = values + + # Loop over the codes and sites + for i_key in range(len(errors)): + + error = errors[i_key] + site = sites[i_key] + count = counts[i_key] + site_state = site_states[i_key] + + # Fill counts + if site_state == 'good': + site_state_encoded = 0 + else: + site_state_encoded = 1 + np_counts[index, codes_dict[error], sites_dict[site], site_state_encoded] += count + + # Fill the error messages + error_message = error_messages[i_key] + # Only continue if there exists a message + if isinstance(error_message, (list,)): + + # Cut/Pad the error message + error_message = np.array(array)(error_message) + pad_size = np_msg.shape[3] - error_message.shape[0] + if pad_size < 0: + error_message = error_message[-np_msg.shape[3] : ] + else: + npad = (0, pad_size) + error_message = np.pad(error_message, pad_width=npad, mode='constant', constant_values=int(0)) + + #print( error_message ) + np_msg[index, codes_dict[error], sites_dict[site]] = error_message + + +def batch_generator( batch ): + + + batch_size = len(batch) + tokens_key = 'msg_encoded' + + # Loop over the messages to find the longest one + padding_dim = 1 + for messages in batch[tokens_key]: + for msg in message: + if isinstance(msg, (list,)): + if len(msg) > padding_dim: + padding_dim = len(msg) + + # Limit to the maximum number of words + if padding_dim > MAX_WORDS: + padding_dim = MAX_WORDS + + # Setup the numpy matrix + np_msg = np.zeros(batch_size, N_CODES, N_SITES, padding_dim, dtype=np.int32) + np_counts = np.zeros((batch_size, N_CODES, N_SITES, N_COUNTS), dtype=np.int32) + + # Fill the matrix + [to_dense(np_msg, np_counts, counter, values) for counter, values in enumerate(zip(batch['error'], + batch['site'], + batch['count'], + batch['site_state'], + batch[tokens_key]))] + + # Reshape the error site matrix for the messages + np_msg = np_msg.reshape((batch_size, N_CODES * N_SITES, padding_dim)) + + # Return the matrix + return [np_msg, np_counts] + + + from skopt.space import Real, Integer, Categorical get_model.parameter_range = [ Real( low=1e-3, high=0.1, prior='log-uniform', name='do' ), @@ -97,8 +181,6 @@ def make_nlp_model(**args): ] -PATH_DATA = '/storage/user/llayer/NNLO' - def get_name(): return 'nlp' diff --git a/nnlo/train/data.py b/nnlo/train/data.py index 56603ab..0ba5e10 100644 --- a/nnlo/train/data.py +++ b/nnlo/train/data.py @@ -266,7 +266,8 @@ def generate_data(self): for cur_pos in range(0, num_in_file, self.batch_size): next_pos = cur_pos + self.batch_size if next_pos <= num_in_file: - yield ( self.get_batch( cur_frame, cur_pos, next_pos ), cur_frame[self.label].iloc[cur_pos : next_pos].value ) + yield ( self.get_batch( cur_frame, cur_pos, next_pos ), + cur_frame[self.labels_name].iloc[cur_pos : next_pos].value ) else: leftovers = cur_frame.iloc[cur_pos, num_in_file] From f28f2c3928cd5506c8042a92abe4930e757bade0 Mon Sep 17 00:00:00 2001 From: Lukas Layer Date: Fri, 18 Oct 2019 07:05:36 -0700 Subject: [PATCH 06/13] batch generator fix typo --- examples/example_nlp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/example_nlp.py b/examples/example_nlp.py index a19ee25..6005d68 100644 --- a/examples/example_nlp.py +++ b/examples/example_nlp.py @@ -141,7 +141,7 @@ def batch_generator( batch ): # Loop over the messages to find the longest one padding_dim = 1 for messages in batch[tokens_key]: - for msg in message: + for msg in messages: if isinstance(msg, (list,)): if len(msg) > 
padding_dim: padding_dim = len(msg) From 5f27636e7077cd57208a547472334089f4a279da Mon Sep 17 00:00:00 2001 From: Lukas Layer Date: Fri, 18 Oct 2019 08:01:14 -0700 Subject: [PATCH 07/13] fix small bugs --- nnlo/train/data.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/nnlo/train/data.py b/nnlo/train/data.py index 0ba5e10..61d0553 100644 --- a/nnlo/train/data.py +++ b/nnlo/train/data.py @@ -201,7 +201,7 @@ def __init__(self, batch_size, feature_adaptor, cache=None, copy_command=None, - preloading=0,, + preloading=0, frame_name='frame', labels_name='label'): @@ -239,7 +239,7 @@ def count_data(self): num_data = 0 for in_file_name in self.file_names: h5_file = h5py.File( in_file_name, 'r' ) - X = h5_file[frame_name] + X = h5_file[self.frame_name] num_data += len(X) h5_file.close() return num_data @@ -266,8 +266,8 @@ def generate_data(self): for cur_pos in range(0, num_in_file, self.batch_size): next_pos = cur_pos + self.batch_size if next_pos <= num_in_file: - yield ( self.get_batch( cur_frame, cur_pos, next_pos ), - cur_frame[self.labels_name].iloc[cur_pos : next_pos].value ) + yield ( self.get_batch( cur_frame, cur_pos, next_pos ), + cur_frame[self.labels_name].iloc[cur_pos : next_pos].value ) else: leftovers = cur_frame.iloc[cur_pos, num_in_file] @@ -283,6 +283,11 @@ def get_batch(self, cur_frame, start_pos, end_pos ): return self.feature_adaptor( batch ) + def finalize(self): + if self.fpl: + self.fpl.stop() + Data.finalize(self) + class H5Data(Data): From 43fcf5bc8564ceee4385e230b19998a674ee9152 Mon Sep 17 00:00:00 2001 From: Lukas Layer Date: Fri, 18 Oct 2019 10:10:10 -0700 Subject: [PATCH 08/13] fix several typos and minor bugs --- examples/example_nlp.py | 26 ++++++++++++++------------ nnlo/train/data.py | 10 ++++++---- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/examples/example_nlp.py b/examples/example_nlp.py index 6005d68..45aabbe 100644 --- a/examples/example_nlp.py +++ b/examples/example_nlp.py @@ -1,5 +1,5 @@ # Constants -PATH_DATA = '/storage/user/llayer/NNLO' +PATH_DATA = '/storage/user/llayer/NNLO/' N_CODES = 77 N_SITES = 81 N_COUNTS = 2 @@ -90,10 +90,10 @@ def make_nlp_model(**args): import numpy as np - -# Dictionary to define the indexing for the codes and sites -codes_dict = #pickle.load ... TODO -sites_dict = #pickle.load ... 
TODO +import pickle +with open('/storage/user/llayer/NNLO/index.pickle', 'rb') as handle: + sites_dict = pickle.load(handle) + codes_dict = pickle.load(handle) def to_dense(np_msg, np_counts, index, values): @@ -120,7 +120,7 @@ def to_dense(np_msg, np_counts, index, values): if isinstance(error_message, (list,)): # Cut/Pad the error message - error_message = np.array(array)(error_message) + error_message = np.array(error_message) pad_size = np_msg.shape[3] - error_message.shape[0] if pad_size < 0: error_message = error_message[-np_msg.shape[3] : ] @@ -151,7 +151,7 @@ def batch_generator( batch ): padding_dim = MAX_WORDS # Setup the numpy matrix - np_msg = np.zeros(batch_size, N_CODES, N_SITES, padding_dim, dtype=np.int32) + np_msg = np.zeros((batch_size, N_CODES, N_SITES, padding_dim), dtype=np.int32) np_counts = np.zeros((batch_size, N_CODES, N_SITES, N_COUNTS), dtype=np.int32) # Fill the matrix @@ -181,23 +181,25 @@ def batch_generator( batch ): ] + + + def get_name(): return 'nlp' def get_train(): - return [PATH_DATA + 'train_0.h5'] + return [PATH_DATA + 'train_0.h5', PATH_DATA + 'train_1.h5', PATH_DATA + 'train_2.h5'] def get_val(): - return [PATH_DATA + 'test_0.h5'] + return [PATH_DATA + 'test_0.h5', PATH_DATA + 'test_1.h5', PATH_DATA + 'test_2.h5'] def get_features(): - #return ('features', lambda x: x) ##example of data adaptor - return 'features' + return ('frame', batch_generator) ##example of data adaptor def get_labels(): - return 'labels' + return 'label' diff --git a/nnlo/train/data.py b/nnlo/train/data.py index 61d0553..c9c8c3a 100644 --- a/nnlo/train/data.py +++ b/nnlo/train/data.py @@ -206,7 +206,7 @@ def __init__(self, batch_size, labels_name='label'): """Initializes and stores names of feature and label datasets""" - super(H5Data, self).__init__(batch_size,cache,copy_command) + super(FrameData, self).__init__(batch_size,cache,copy_command) self.feature_adaptor = feature_adaptor self.frame_name = frame_name self.labels_name = labels_name @@ -219,6 +219,7 @@ def __init__(self, batch_size, def load_data(self, in_file_name): + """ if self.fpl: h5_file = self.fpl.getFile( in_file_name ) else: @@ -230,7 +231,8 @@ def load_data(self, in_file_name): self.fpl.closeFile( in_file_name ) else: h5_file.close() - + """ + frame = pd.read_hdf(in_file_name, 'frame') return frame @@ -261,13 +263,13 @@ def generate_data(self): cur_frame = self.concat_data( leftovers, cur_frame ) leftovers = None - num_in_file = len(frame) + num_in_file = len(cur_frame) for cur_pos in range(0, num_in_file, self.batch_size): next_pos = cur_pos + self.batch_size if next_pos <= num_in_file: yield ( self.get_batch( cur_frame, cur_pos, next_pos ), - cur_frame[self.labels_name].iloc[cur_pos : next_pos].value ) + cur_frame[self.labels_name].iloc[cur_pos : next_pos].values) else: leftovers = cur_frame.iloc[cur_pos, num_in_file] From 5d6e5016304c5e3d3b38dcb5dae7a47c59dee6af Mon Sep 17 00:00:00 2001 From: Lukas Layer Date: Fri, 18 Oct 2019 11:30:48 -0700 Subject: [PATCH 09/13] add fast check that the dense vector is correct --- examples/example_nlp.py | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/examples/example_nlp.py b/examples/example_nlp.py index 45aabbe..5b1b09e 100644 --- a/examples/example_nlp.py +++ b/examples/example_nlp.py @@ -202,5 +202,40 @@ def get_labels(): return 'label' - +if __name__ == "__main__": + + import pandas as pd + # Open a frame + path = PATH_DATA + 'train_0.h5' + frame = pd.read_hdf(path, 'frame') + print( frame.head() ) + + # Get a 
batch + start = 0 + batch_size = 2 + batch = frame.iloc[start: start+batch_size] + matrix = batch_generator( batch ) + print( matrix[0].shape, matrix[1].shape ) + matrix_msg = matrix[0].reshape((batch_size, N_CODES, N_SITES, matrix[0].shape[2])) + + # Fast check that the matrix is filled correctly + def print_sample( batch, index ): + + sample = batch.iloc[index] + errors = sample['error'] + sites = sample['site'] + message = sample['msg_encoded'] + print( errors ) + print( sites ) + print( message ) + + for i_key in range(len(errors)): + + print( 'Index error', errors[i_key], ':', codes_dict[errors[i_key]], + 'Index site', sites[i_key], ':', sites_dict[sites[i_key]] ) + print( 'Inserted in matrix' ) + print( matrix_msg[index, codes_dict[errors[i_key]], sites_dict[sites[i_key]]] ) + + print_sample( batch, 1 ) + From c07820c2350bd8a670eb419a4d6824d281fef48d Mon Sep 17 00:00:00 2001 From: Lukas Layer Date: Sat, 19 Oct 2019 05:12:40 -0700 Subject: [PATCH 10/13] first run --- TrainingDriver.py | 30 ++++++++++++++++++++++++++---- examples/example_nlp.py | 5 ++++- nnlo/train/data.py | 12 +++++------- 3 files changed, 35 insertions(+), 12 deletions(-) diff --git a/TrainingDriver.py b/TrainingDriver.py index cc8a654..7cc5ffb 100755 --- a/TrainingDriver.py +++ b/TrainingDriver.py @@ -15,7 +15,7 @@ from nnlo.mpi.manager import MPIManager, get_device from nnlo.train.algo import Algo -from nnlo.train.data import H5Data +from nnlo.train.data import H5Data, FrameData from nnlo.train.model import ModelFromJson, ModelTensorFlow, ModelPytorch from nnlo.util.utils import import_keras from nnlo.util.timeline import Timeline @@ -136,6 +136,8 @@ def add_train_options(parser): add_checkpoint_options(parser) def make_loader( args, features_name, labels_name, train_list): + + """ data = H5Data( batch_size=args.batch, cache = args.caching_dir, copy_command = args.copy_command, @@ -143,6 +145,17 @@ def make_loader( args, features_name, labels_name, train_list): features_name=features_name, labels_name=labels_name, ) + """ + + data = FrameData(batch_size=args.batch, + feature_adaptor = features_name[1], + cache = args.caching_dir, + copy_command = args.copy_command, + preloading = None, #args.data_preload, + frame_name=features_name[0], + labels_name=labels_name, + ) + # We initialize the Data object with the training data list # so that we can use it to count the number of training examples data.set_full_file_names( train_list ) @@ -230,13 +243,16 @@ def make_features_labels(m_module, args): args = parser.parse_args() initialize_logger(filename=args.log_file, file_level=args.log_level, stream_level=args.log_level) + + + a_backend = args.backend if 'torch' in args.model: a_backend = 'torch' m_module = __import__(args.model.replace('.py','').replace('/', '.'), fromlist=[None]) if '.py' in args.model else None (features_name, labels_name) = make_features_labels(m_module, args) - (train_list, val_list) = make_train_val_lists(m_module, args) + (train_list, val_list) = make_train_val_lists(m_module, args) comm = MPI.COMM_WORLD.Dup() if args.timeline: Timeline.enable() @@ -287,9 +303,15 @@ def make_features_labels(m_module, args): model_builder = ModelTensorFlow( comm, source=args.model, weights=model_weights) - + data = make_loader(args, features_name, labels_name, train_list) - + + #print( data ) + #print( train_list ) + #print( stop ) + print( 'DATA', data.count_data() ) + #print( stop ) + # Some input arguments may be ignored depending on chosen algorithm algo = make_algo( args, use_tf, comm, 
validate_every=int(data.count_data()/args.batch )) diff --git a/examples/example_nlp.py b/examples/example_nlp.py index 5b1b09e..3bc3af4 100644 --- a/examples/example_nlp.py +++ b/examples/example_nlp.py @@ -1,5 +1,5 @@ # Constants -PATH_DATA = '/storage/user/llayer/NNLO/' +PATH_DATA = '/storage/group/gpu/bigdata/CMSOpPred/' N_CODES = 77 N_SITES = 81 N_COUNTS = 2 @@ -204,6 +204,9 @@ def get_labels(): if __name__ == "__main__": + model = get_model() + model.summary() + import pandas as pd # Open a frame path = PATH_DATA + 'train_0.h5' diff --git a/nnlo/train/data.py b/nnlo/train/data.py index c9c8c3a..1a51d3e 100644 --- a/nnlo/train/data.py +++ b/nnlo/train/data.py @@ -232,7 +232,7 @@ def load_data(self, in_file_name): else: h5_file.close() """ - frame = pd.read_hdf(in_file_name, 'frame') + frame = pd.read_hdf(in_file_name, self.frame_name) return frame @@ -240,10 +240,8 @@ def count_data(self): num_data = 0 for in_file_name in self.file_names: - h5_file = h5py.File( in_file_name, 'r' ) - X = h5_file[self.frame_name] - num_data += len(X) - h5_file.close() + frame = pd.read_hdf(in_file_name, self.frame_name) + num_data += len(frame) return num_data @@ -271,7 +269,7 @@ def generate_data(self): yield ( self.get_batch( cur_frame, cur_pos, next_pos ), cur_frame[self.labels_name].iloc[cur_pos : next_pos].values) else: - leftovers = cur_frame.iloc[cur_pos, num_in_file] + leftovers = cur_frame.iloc[cur_pos : num_in_file] def get_batch(self, cur_frame, start_pos, end_pos ): @@ -280,7 +278,7 @@ def get_batch(self, cur_frame, start_pos, end_pos ): Convert the batch of the dataframe to a numpy array with the provided function """ - + #print( 'Gen batch' ) batch = cur_frame.iloc[start_pos : end_pos] return self.feature_adaptor( batch ) From 015286a3a9f23fc1b7c3dc980d01dbf0fa2d9072 Mon Sep 17 00:00:00 2001 From: Lukas Layer Date: Tue, 22 Oct 2019 08:21:12 -0700 Subject: [PATCH 11/13] Add parser option for the dataframe loader --- TrainingDriver.py | 44 +++++++++++++++++++---------------------- examples/example_nlp.py | 1 - nnlo/train/data.py | 35 +++++--------------------------- 3 files changed, 25 insertions(+), 55 deletions(-) diff --git a/TrainingDriver.py b/TrainingDriver.py index 7cc5ffb..3540486 100755 --- a/TrainingDriver.py +++ b/TrainingDriver.py @@ -57,6 +57,7 @@ def add_downpour_options(parser): def add_loader_options(parser): + parser.add_argument('--data-loader',help='Data loader to load the input files',default='h5py', dest='data_loader') parser.add_argument('--preload-data', help='Preload files as we read them', default=0, type=int, dest='data_preload') parser.add_argument('--cache-data', help='Cache the input files to a provided directory', default='', dest='caching_dir') parser.add_argument('--copy-command', help='Specific command line to copy the data into the cache. Expect a string with two {} first is the source (from input file list), second is the bare file name at destination. 
Like "cp {} {}"', default=None, dest='copy_command') @@ -137,24 +138,25 @@ def add_train_options(parser): def make_loader( args, features_name, labels_name, train_list): - """ - data = H5Data( batch_size=args.batch, - cache = args.caching_dir, - copy_command = args.copy_command, - preloading = args.data_preload, - features_name=features_name, - labels_name=labels_name, - ) - """ - - data = FrameData(batch_size=args.batch, - feature_adaptor = features_name[1], - cache = args.caching_dir, - copy_command = args.copy_command, - preloading = None, #args.data_preload, - frame_name=features_name[0], - labels_name=labels_name, - ) + if 'dataframe' in args.data_loader: + + data = FrameData(batch_size=args.batch, + feature_adaptor = features_name[1], + cache = args.caching_dir, + copy_command = args.copy_command, + preloading = None, #args.data_preload, + frame_name=features_name[0], + labels_name=labels_name, + ) + else: + + data = H5Data( batch_size=args.batch, + cache = args.caching_dir, + copy_command = args.copy_command, + preloading = args.data_preload, + features_name=features_name, + labels_name=labels_name, + ) # We initialize the Data object with the training data list # so that we can use it to count the number of training examples @@ -306,12 +308,6 @@ def make_features_labels(m_module, args): data = make_loader(args, features_name, labels_name, train_list) - #print( data ) - #print( train_list ) - #print( stop ) - print( 'DATA', data.count_data() ) - #print( stop ) - # Some input arguments may be ignored depending on chosen algorithm algo = make_algo( args, use_tf, comm, validate_every=int(data.count_data()/args.batch )) diff --git a/examples/example_nlp.py b/examples/example_nlp.py index 3bc3af4..fa150b7 100644 --- a/examples/example_nlp.py +++ b/examples/example_nlp.py @@ -134,7 +134,6 @@ def to_dense(np_msg, np_counts, index, values): def batch_generator( batch ): - batch_size = len(batch) tokens_key = 'msg_encoded' diff --git a/nnlo/train/data.py b/nnlo/train/data.py index 1a51d3e..bdd3b9b 100644 --- a/nnlo/train/data.py +++ b/nnlo/train/data.py @@ -195,8 +195,7 @@ def load_data(self, in_file): class FrameData(Data): - - + """ Load pandas frame stored in hdf5 files """ def __init__(self, batch_size, feature_adaptor, cache=None, @@ -204,40 +203,19 @@ def __init__(self, batch_size, preloading=0, frame_name='frame', labels_name='label'): - """Initializes and stores names of feature and label datasets""" super(FrameData, self).__init__(batch_size,cache,copy_command) self.feature_adaptor = feature_adaptor self.frame_name = frame_name self.labels_name = labels_name ## initialize the data-preloader - self.fpl = None - if preloading: - self.fpl = FilePreloader( [] , file_open = lambda n : h5py.File(n,'r'), n_ahead=preloading) - self.fpl.start() - + self.fpl = None def load_data(self, in_file_name): - - """ - if self.fpl: - h5_file = self.fpl.getFile( in_file_name ) - else: - h5_file = h5py.File( in_file_name, 'r' ) - - frame = h5_file[self.frame_name] - - if self.fpl: - self.fpl.closeFile( in_file_name ) - else: - h5_file.close() - """ frame = pd.read_hdf(in_file_name, self.frame_name) return frame - def count_data(self): - num_data = 0 for in_file_name in self.file_names: frame = pd.read_hdf(in_file_name, self.frame_name) @@ -246,23 +224,21 @@ def count_data(self): def concat_data(self, data1, data2): - return pd.concat([data1, data2]) def generate_data(self): - + """ + Overwrite the the parent generate_data and adapt to pandas frames + """ leftovers = None for cur_file_name in 
self.file_names: cur_frame = self.load_data(cur_file_name) - # concatenate any leftover data from the previous file if leftovers is not None: cur_frame = self.concat_data( leftovers, cur_frame ) leftovers = None - num_in_file = len(cur_frame) - for cur_pos in range(0, num_in_file, self.batch_size): next_pos = cur_pos + self.batch_size if next_pos <= num_in_file: @@ -273,7 +249,6 @@ def generate_data(self): def get_batch(self, cur_frame, start_pos, end_pos ): - """ Convert the batch of the dataframe to a numpy array with the provided function From 7cd435f9adee52d491171996d3d0d681cf4c82f0 Mon Sep 17 00:00:00 2001 From: Lukas Layer Date: Tue, 22 Oct 2019 08:23:01 -0700 Subject: [PATCH 12/13] fix spaces --- nnlo/train/data.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/nnlo/train/data.py b/nnlo/train/data.py index bdd3b9b..1f8498f 100644 --- a/nnlo/train/data.py +++ b/nnlo/train/data.py @@ -222,11 +222,9 @@ def count_data(self): num_data += len(frame) return num_data - def concat_data(self, data1, data2): return pd.concat([data1, data2]) - def generate_data(self): """ Overwrite the the parent generate_data and adapt to pandas frames @@ -247,7 +245,6 @@ def generate_data(self): else: leftovers = cur_frame.iloc[cur_pos : num_in_file] - def get_batch(self, cur_frame, start_pos, end_pos ): """ Convert the batch of the dataframe to a numpy array @@ -257,7 +254,6 @@ def get_batch(self, cur_frame, start_pos, end_pos ): batch = cur_frame.iloc[start_pos : end_pos] return self.feature_adaptor( batch ) - def finalize(self): if self.fpl: self.fpl.stop() From 57b94a525c8e436fbaf1dea2fd27028fa1bbfb7c Mon Sep 17 00:00:00 2001 From: Lukas Layer Date: Tue, 22 Oct 2019 20:18:45 +0200 Subject: [PATCH 13/13] Fix typo --- examples/example_nlp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/example_nlp.py b/examples/example_nlp.py index fa150b7..b5e9b07 100644 --- a/examples/example_nlp.py +++ b/examples/example_nlp.py @@ -69,7 +69,7 @@ def make_nlp_model(**args): sent_encoded_reshaped = Reshape(( N_CODES , N_SITES, rnn_units))(sent_encoded) concat_counts_sent = Concatenate(axis=3)([sent_encoded_reshaped, count_input]) if encode_sites: - codes_reshaped = Reshape(( N_CODES , N_SITES * (rnn_units*N_COUNTS)))(concat_counts_sent) + codes_reshaped = Reshape(( N_CODES , N_SITES * (rnn_units + N_COUNTS)))(concat_counts_sent) sites_encoded = TimeDistributed(Dense(site_units, activation = 'relu', kernel_regularizer=l2(l2_reg)))(codes_reshaped) flat = Flatten()(sites_encoded) else:
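
As a quick end-to-end illustration of how the pieces introduced above fit together, here is a minimal sketch (not part of the patch series) that drives the new FrameData loader with the adaptor returned by get_features(), mirroring what make_loader() does in TrainingDriver.py when the dataframe loader is selected. It assumes examples.example_nlp is importable from the working directory and that the hard-coded pickle and HDF5 paths exist; the batch size of 32 is an arbitrary choice for the sketch.

# Hedged sketch: wire FrameData (nnlo/train/data.py) to the batch_generator adaptor
# from examples/example_nlp.py. Assumptions: examples.example_nlp is importable
# (importing it reads the index pickle) and the HDF5 files listed by get_train() exist.
import examples.example_nlp as example_nlp
from nnlo.train.data import FrameData

frame_name, adaptor = example_nlp.get_features()       # ('frame', batch_generator)

data = FrameData(batch_size=32,                        # arbitrary batch size for this sketch
                 feature_adaptor=adaptor,              # turns a frame slice into [np_msg, np_counts]
                 frame_name=frame_name,
                 labels_name=example_nlp.get_labels())  # 'label'
data.set_full_file_names(example_nlp.get_train())
print('training examples:', data.count_data())

# Each yielded batch is ([np_msg, np_counts], labels), matching the two inputs of make_nlp_model.
for features, labels in data.generate_data():
    print(features[0].shape, features[1].shape, labels.shape)
    break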