diff --git a/.gitignore b/.gitignore
index abe4a4a..a52bb86 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,9 @@
 *.pyc
 *.swp
-*.json
 *.h5
-*.txt
-.DS_Store
\ No newline at end of file
+.DS_Store
+build/
+dist/
+nnlo.egg-info/
+publish.sh
+.local/
diff --git a/models/__init__.py b/HISTORY.md
similarity index 100%
rename from models/__init__.py
rename to HISTORY.md
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..e69de29
diff --git a/README.md b/README.md
index 88445f8..168cc4c 100644
--- a/README.md
+++ b/README.md
@@ -10,36 +10,38 @@ The original package was implemented by [Dustin Anderson](https://github.com/dua
 Test with the MNIST dataset, with keras+tensorflow
 ```
-git clone https://github.com/vlimant/NNLO.git
+pip install nnlo
 cd NNLO
 ```
 
 Example with mnist provided in a python file
 ```
-python3 models/get_mnist.py
-mpirun -np 3 --tag-output python3 TrainingDriver.py --model examples/example_mnist.py --loss categorical_crossentropy --epochs 3
-mpirun -np 3 --tag-output python3 TrainingDriver.py --model examples/example_mnist_torch.py --loss categorical_crossentropy --epochs 3
+GetData mnist
+mpirun -np 3 TrainingDriver --model mnist --loss categorical_crossentropy --epochs 3 --trial-name n3g1epoch3 --train_data /path/to/train_mnist.list --val_data /path/to/test_mnist.list
+mpirun -np 3 python TrainingDriver.py --model examples/example_mnist_torch.py --loss categorical_crossentropy --epochs 3
+jsrun -n 3 -g 1 TrainingDriver --model mnist --loss categorical_crossentropy --epochs 3 --trial-name n3g1epoch3 --train_data /path/to/train_mnist.list --val_data /path/to/test_mnist.list
 ```
 
 Example with the cifar10 with model json
 ```
-python3 models/BuildModel.py cifar10
+GetData cifar10
 python3 models/get_cifar10.py
-mpirun -np 3 --tag-output python3 TrainingDriver.py --model cifar10_arch.json --train train_cifar10.list --val test_cifar10.list --loss categorical_crossentropy --epochs 5
+mpirun -np 3 TrainingDriver --model cifar10 --loss categorical_crossentropy --epochs 3 --trial-name n3g1epoch3 --train_data /path/to/train_cifar10.list --val_data /path/to/test_cifar10.list
+jsrun -n 3 -g 1 TrainingDriver --model cifar10 --loss categorical_crossentropy --epochs 3 --trial-name n3g1epoch3 --train_data /path/to/train_cifar10.list --val_data /path/to/test_cifar10.list
 ```
 
 Example of training mnist with 2 workers, each with 2 process per Horovod ring
 ```
-mpirun -np 5 --tag-output python3 TrainingDriver.py --model examples/example_mnist.py --loss categorical_crossentropy --epochs 3 --n-processes 2
+mpirun -np 5 python3 TrainingDriver.py --model examples/example_mnist.py --loss categorical_crossentropy --epochs 3 --n-processes 2
 ```
 
 Example of training mnist with early stopping
 ```
-mpirun -np 3 --tag-output python3 TrainingDriver.py --model examples/example_mnist.py --loss categorical_crossentropy --epochs 10000 --early "val_loss,~<,4"
+mpirun -np 3 python3 TrainingDriver.py --model examples/example_mnist.py --loss categorical_crossentropy --epochs 10000 --early "val_loss,~<,4"
 ```
 
 Example of training with a fixed target
 ```
-mpirun -np 3 --tag-output python3 TrainingDriver.py --model examples/example_mnist.py --loss categorical_crossentropy --epochs 10000 --target-metric "val_acc,>,0.97"
+mpirun -np 3 python3 TrainingDriver.py --model examples/example_mnist.py --loss categorical_crossentropy --epochs 10000 --target-metric "val_acc,>,0.97"
 ```
 
 ## GAN Examples (experimental)
diff --git a/examples/example_mnist.py b/examples/example_mnist.py
deleted file mode 100644
index
a6b11bd..0000000 --- a/examples/example_mnist.py +++ /dev/null @@ -1,39 +0,0 @@ -from models.Models import make_mnist_model - -get_model = make_mnist_model -def get_name(): - return 'mnist' - -def get_all(): - import socket,os,glob - host = os.environ.get('HOST',os.environ.get('HOSTNAME',socket.gethostname())) - - if 'daint' in host: - all_list = glob.glob('/scratch/snx3000/vlimant/data/mnist/*.h5') - elif 'titan' in host: - all_list = glob.glob('/ccs/proj/csc291/DATA/mnist/*.h5') - else: - all_list = glob.glob('/bigdata/shared/mnist/*.h5') - if not all_list: - all_list = glob.glob('mnist_*.h5') - return all_list - -def get_train(): - all_list = get_all() - l = int( len(all_list)*0.70) - train_list = all_list[:l] - return train_list - -def get_val(): - all_list = get_all() - l = int( len(all_list)*0.70) - val_list = all_list[l:] - return val_list - -def get_features(): - #return ('features', lambda x: x) ##example of data adaptor - return 'features' - -def get_labels(): - return 'labels' - diff --git a/examples/example_mnist_torch.py b/examples/example_mnist_torch.py deleted file mode 100644 index af916c9..0000000 --- a/examples/example_mnist_torch.py +++ /dev/null @@ -1,5 +0,0 @@ -from models.Models import make_mnist_torch_model -from examples.example_mnist import * - -get_model = make_mnist_torch_model - diff --git a/models/BuildModel.py b/models/BuildModel.py deleted file mode 100644 index 1529ca7..0000000 --- a/models/BuildModel.py +++ /dev/null @@ -1,50 +0,0 @@ -### Builds one of the available models. -# Saves model architecture to _arch.json -# and model weights to _weights.h5 - -import os -os.environ['CUDA_VISIBLE_DEVICES']="" -import argparse - -from Models import make_model - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('model_name', help='model to construct') - parser.add_argument('model_args', nargs='*', help='key=value to pass to the model',default=[]) - args = parser.parse_args() - model_name = args.model_name - model_args = {} - for kw in args.model_args: - k,v = kw.split('=') - try: - v = int(v) - except: - v= float(v) - model_args[k] = v - if model_args: - print ("passing",model_args,"to the model builder") - model = make_model( model_name ,**model_args) - else: - model = make_model( model_name) - weights_filename = "%s_weights.h5" % model_name - arch_filename = "%s_arch.json" % model_name - - if not "torch" in model_name: - model.summary() - model.save_weights( weights_filename, overwrite=True ) - print ("Saved model weights to {0}".format(weights_filename)) - - model_arch = model.to_json() - with open( arch_filename, 'w' ) as arch_file: - arch_file.write( model_arch ) - print ("Saved model architecture to {0}".format(arch_filename)) - else: - import torch - weights_filename = weights_filename.replace('h5','torch') - arch_filename = arch_filename.replace('json','torch') - torch.save(model.state_dict(), weights_filename) - print ("Saved model weights to {0}".format(weights_filename)) - torch.save(model, arch_filename) - print ("Saved model architecture to {0}".format(arch_filename)) - diff --git a/models/Models.py b/models/Models.py deleted file mode 100644 index 734f784..0000000 --- a/models/Models.py +++ /dev/null @@ -1,247 +0,0 @@ -### Predefined Keras models - -import sys -import logging - -def model_function(model_name): - """Constructs the Keras model indicated by model_name""" - model_maker_dict = { - 'example':make_example_model, - 'mnist':make_mnist_model, - 'cifar10':make_cifar10_model, - 
'mnist_torch':make_mnist_torch_model, - 'topclass': make_topclass_model, - 'topclass_torch':make_topclass_torch_model - - } - return model_maker_dict[model_name] -def make_model(model_name, **args): - m_fn = model_function(model_name) - if args and hasattr(m_fn,'parameter_range'): - provided = set(args.keys()) - accepted = set([a.name for a in m_fn.parameter_range]) - if not provided.issubset( accepted ): - logging.error("provided arguments {} do not match the accepted ones {}".format(sorted(provided),sorted(accepted))) - sys.exit(-1) - return model_function(model_name)(**args) - -def make_example_model(): - """Example model from keras documentation""" - from keras.models import Sequential - from keras.layers import Dense, Activation - model = Sequential() - model.add(Dense(output_dim=64, input_dim=100)) - model.add(Activation("relu")) - model.add(Dense(output_dim=10)) - model.add(Activation("softmax")) - return model - -def make_topclass_model(**args): - from keras.models import Sequential, Model - from keras.layers import Dense, Activation, Dropout, Flatten, Input, Permute - from keras.layers import Convolution2D, MaxPooling2D, Conv2D - if args:logging.debug("receiving arguments {}".format(args)) - conv_layers=args.get('conv_layers',2) - dense_layers=args.get('dense_layers',2) - dropout=args.get('dropout',0.2) - kernel = args.get('kernel_size',3) - classes=3 - in_channels=5 - in_ch = in_channels - ## the trace in the input file is 750, 150, 94, 5 - input = Input( (150,94,in_ch)) - ## convs - c = input - for i in range(conv_layers): - channel_in = in_ch*((i+1)%5) - channel_out = in_ch*((i+2)%5) - if channel_in == 0: channel_in += 1 - if channel_out == 0: channel_out += 1 - c = Conv2D( filters=channel_out, kernel_size=(kernel,kernel) , strides=1, padding="same", activation = 'relu') (c) - c = Conv2D(1, (kernel,kernel), activation = 'relu',strides=2, padding="same")(c) - - ## pooling - pool = args.get('pool', 10) - m = MaxPooling2D((pool,pool))(c) - f = Flatten()(m) - d = f - base = args.get('hidden_factor',5)*100 - for i in range(dense_layers): - N = int(base//(2**(i+1))) - d = Dense( N, activation='relu')(d) - if dropout: - d = Dropout(dropout)(d) - o = Dense(classes, activation='softmax')(d) - - model = Model(inputs=input, outputs=o) - #model.summary() - return model - -def make_cifar10_model(**args): - from keras.models import Sequential, Model - from keras.layers import Dense, Activation, Dropout, Flatten, Input, Permute - from keras.layers import Convolution2D, MaxPooling2D, Conv2D - import keras.backend as K - if args:logging.debug("receiving arguments {}".format(args)) - nb_classes = 10 - img_rows, img_cols = 32, 32 - - # use 1 kernel size for all convolutional layers - ks = args.get('kernel_size', 3) - - # tune the number of filters for each convolution layer - nb_filters1 = args.get('nb_filters1', 48) - nb_filters2 = args.get('nb_filters2', 96) - nb_filters3 = args.get('nb_filters3', 192) - - # tune the pool size once - ps = args.get('pool_size', 2) - pool_size = (ps,ps) - - # tune the dropout rates independently - do4 = args.get('dropout1', 0.25) - do5 = args.get('dropout2', 0.5) - - # tune the dense layers independently - dense1 = args.get('dense1', 512) - dense2 = args.get('dense2', 256) - - if K.image_dim_ordering() == 'th': - input_shape = (3, img_rows, img_cols) - else: - input_shape = (img_rows, img_cols, 3) - - #act = 'sigmoid' - act = 'relu' - - i = Input( input_shape) - l = Conv2D(nb_filters1,( ks, ks), padding='same', activation = act)(i) - l = 
MaxPooling2D(pool_size=pool_size)(l) - #l = Dropout(do1)(l) - - l = Conv2D(nb_filters2, (ks, ks), padding='same',activation=act)(l) - #l = Conv2D(nb_filters2, (ks, ks))(l) - l = MaxPooling2D(pool_size=pool_size)(l) - #l = Dropout(do2)(l) - - l = Conv2D(nb_filters3, (ks, ks), padding='same',activation=act)(l) - #l = Conv2D(nb_filters3, (ks, ks))(l) - l = MaxPooling2D(pool_size=pool_size)(l) - #l = Dropout(do3)(l) - - l = Flatten()(l) - l = Dense(dense1,activation=act)(l) - l = Dropout(do4)(l) - l = Dense(dense2,activation=act)(l) - l =Dropout(do5)(l) - - o = Dense(nb_classes, activation='softmax')(l) - - model = Model(inputs=i, outputs=o) - #model.summary() - - return model - -def make_mnist_model(**args): - from keras.models import Sequential, Model - from keras.layers import Dense, Activation, Dropout, Flatten, Input, Permute - from keras.layers import Convolution2D, MaxPooling2D, Conv2D - import keras.backend as K - """MNIST ConvNet from keras/examples/mnist_cnn.py""" - #np.random.seed(1337) # for reproducibility - if args:logging.debug("receiving arguments {}".format(args)) - nb_classes = 10 - # input image dimensions - img_rows, img_cols = 28, 28 - # number of convolutional filters to use - nb_filters = args.get('nb_filters',32) - # size of pooling area for max pooling - ps = args.get('pool_size',2) - - # convolution kernel size - ks = args.get('kernel_size',3) - do = args.get('dropout', 0.25) - dense = args.get('dense', 128) - - pool_size = (ps,ps) - if K.image_dim_ordering() == 'th': - input_shape = (1, img_rows, img_cols) - else: - input_shape = (img_rows, img_cols, 1) - model = Sequential() - model.add(Convolution2D(nb_filters, (ks, ks), - border_mode='valid', - input_shape=input_shape)) - model.add(Activation('relu')) - model.add(Convolution2D(nb_filters, (ks, ks))) - model.add(Activation('relu')) - model.add(MaxPooling2D(pool_size=pool_size)) - model.add(Dropout(do)) - model.add(Flatten()) - model.add(Dense(dense)) - model.add(Activation('relu')) - model.add(Dropout(do)) - model.add(Dense(nb_classes)) - model.add(Activation('softmax')) - return model - -def make_mnist_torch_model(**args): - if args:logging.debug("receiving arguments {}".format(args)) - try: - from TorchModels import MNistNet - except: - from .TorchModels import MNistNet - model = MNistNet(**args) - return model - -def make_topclass_torch_model(**args): - if args:logging.debug("receiving arguments {}".format(args)) - conv_layers=args.get('conv_layers',2) - dense_layers=args.get('dense_layers',2) - dropout=args.get('dropout',0.5) - classes=3 - in_channels=5 - try: - from TorchModels import CNN - except: - from .TorchModels import CNN - model = CNN(conv_layers=conv_layers, dense_layers=dense_layers, dropout=dropout, classes=classes, in_channels=in_channels) - return model - -try: - from skopt.space import Real, Integer, Categorical - make_mnist_model.parameter_range = [ - Integer(10,50, name='nb_filters'), - Integer(2,10, name='pool_size'), - Integer(2,10, name='kernel_size'), - Integer(50,200, name='dense'), - Real(0.0, 1.0, name='dropout') - ] - make_mnist_torch_model.parameter_range = [ - Integer(2,10, name='kernel_size'), - Integer(50,200, name='dense'), - Real(0.0, 1.0, name='dropout') - ] - make_topclass_model.parameter_range = [ - Integer(1,6, name='conv_layers'), - Integer(1,6, name='dense_layers'), - Integer(1,6, name='kernel_size'), - Real(0.0, 1.0, name='dropout') - ] - make_topclass_torch_model.parameter_range = [ - Integer(1,6, name='conv_layers'), - Integer(1,6, name='dense_layers'), - Real(0.0,1.0, 
name='dropout') - ] - make_cifar10_model.parameter_range = [ - Integer(10,300, name='nb_filters1'), - Integer(10,300, name='nb_filters2'), - Integer(10,300, name='nb_filters3'), - Integer(50,1000, name='dense1'), - Integer(50,1000, name='dense2'), - Real(0.0, 1.0, name='dropout1'), - Real(0.0, 1.0, name='dropout2') - ] -except: - pass - diff --git a/models/get_3d.py b/models/get_3d.py deleted file mode 100644 index 53a4a7b..0000000 --- a/models/get_3d.py +++ /dev/null @@ -1,91 +0,0 @@ -import os -import glob -try: - import h5py - pass -except: - print ("hum") -import numpy as np -import sys - -def get_data(datafile): - #get data for training - #print ('Loading Data from .....', datafile) - f=h5py.File(datafile,'r') - y=f.get('target') - X=np.array(f.get('ECAL')) - y=(np.array(y[:,1])) - X[X < 1e-4] = 0 - X = np.expand_dims(X, axis=-1) - X = X.astype(np.float32) - y = y.astype(np.float32) - y = y/100. - ecal = np.squeeze(np.sum(X, axis=(1, 2, 3))) - print (X.shape) - print (y.shape) - print (ecal.shape) - - f.close() - return X, y, ecal - -dest='/data/shared/3DGAN/' -import socket -host = os.environ.get('HOST', os.environ.get('HOSTNAME',socket.gethostname())) -if 'daint' in host: - dest='/scratch/snx3000/vlimant/3DGAN/' -if 'titan' in host: - dest='/ccs/proj/csc291/DATA/3DGAN/' - -sub_split = int(sys.argv[1]) if len(sys.argv)>1 else 1 - -for F in glob.glob('/bigdata/shared/LCD/NewV1/*scan/*.h5'): - _,d,f = F.rsplit('/',2) - if not 'Ele' in d: continue - X = None - if sub_split==1: - nf = '%s/%s_%s.h5'%( dest,d,f) - if os.path.isfile( nf) : - continue - print ("processing files",F,"into",nf) - if X is None: - X,y,ecal = get_data(F) - o = h5py.File(nf,'w') - o['X'] = X - o.create_group("y") - o['y']['a'] = np.ones(y.shape) - o['y']['b'] = y - o['y']['c'] = ecal - o.close() - else: - for sub in range(sub_split): - nf = '%s/%s_%s_sub%s.h5'%(dest, d,f,sub) - if os.path.isfile( nf) : - continue - print ("processing files",F,"into",nf) - if X is None: - X,y,ecal = get_data(F) - N = X.shape[0] - splits = [i*N/sub_split for i in range(sub_split)]+[-1] - o = h5py.File(nf,'w') - o['X'] = X[splits[sub]:splits[sub+1],...] - o.create_group("y") - o['y']['a'] = np.ones(y[splits[sub]:splits[sub+1],...].shape) - o['y']['b'] = y[splits[sub]:splits[sub+1],...] - o['y']['c'] = ecal[splits[sub]:splits[sub+1],...] - o.close() - X = None - -if sub_split == 1: - sub_files = lambda f:not 'sub' in f -else: - sub_files = lambda f:'sub' in f - -open('train_3d.list','w').write( '\n'.join(filter(sub_files,glob.glob(dest+'/*.h5')[:-4]))) -open('test_3d.list','w').write( '\n'.join(filter(sub_files,glob.glob(dest+'/*.h5')[-4:]))) - -open('train_small_3d.list','w').write( '\n'.join(filter(sub_files,glob.glob(dest+'/*.h5')[:-4]))) -open('test_small_3d.list','w').write( '\n'.join(filter(sub_files,glob.glob(dest+'/*.h5')[-4:]))) - -open('train_7_3d.list','w').write( '\n'.join(filter(sub_files,glob.glob(dest+'/*.h5')[:7]))) -open('test_1_3d.list','w').write( '\n'.join(filter(sub_files,glob.glob(dest+'/*.h5')[-1:]))) - diff --git a/models/get_cifar10.py b/models/get_cifar10.py deleted file mode 100644 index bebcb8a..0000000 --- a/models/get_cifar10.py +++ /dev/null @@ -1,53 +0,0 @@ -### This script downloads the cifar10 dataset, unpacks it, splits it into four pieces, and saves -# each piece in a separate h5 file. 
- -from numpy import array_split -from keras.datasets import cifar10 -from keras.utils import np_utils -from keras import backend as K -import h5py -import sys - -(X_train, Y_train), (X_test, Y_test) = cifar10.load_data() - -img_rows = 32 -img_cols = 32 -if K.image_dim_ordering() == 'th': - X_train = X_train.reshape(X_train.shape[0], 3, img_rows, img_cols) - X_test = X_test.reshape(X_test.shape[0], 3, img_rows, img_cols) - input_shape = (3, img_rows, img_cols) -else: - X_train = X_train.reshape(X_train.shape[0], img_rows, img_cols, 3) - X_test = X_test.reshape(X_test.shape[0], img_rows, img_cols, 3) - input_shape = (img_rows, img_cols, 3) - -num_train_pieces = int(sys.argv[1]) if len(sys.argv)>1 else 24 -num_test_pieces = int(sys.argv[2]) if len(sys.argv)>1 else 4 -split_X_train = [ X.astype('float32') / 255 for X in array_split(X_train, num_train_pieces) ] -split_Y_train = [ np_utils.to_categorical(Y,10) for Y in array_split(Y_train, num_train_pieces) ] -split_X_test = [ X.astype('float32') / 255 for X in array_split(X_test, num_test_pieces) ] -split_Y_test = [ np_utils.to_categorical(Y,10) for Y in array_split(Y_test, num_test_pieces) ] - -train_list = [] -for i in range(num_train_pieces): - train_name = "cifar10_train_%d.h5" % i - train_list.append(train_name+"\n") - train_outfile = h5py.File( train_name, 'w' ) - train_outfile.create_dataset( "features", data=split_X_train[i] ) - train_outfile.create_dataset( "labels", data=split_Y_train[i] ) - train_outfile.close() -with open('train_cifar10.list', 'w') as train_list_file: - for f in train_list: - train_list_file.write(f) - -test_list = [] -for i in range(num_test_pieces): - test_name = "cifar10_test_%d.h5" % i - test_list.append(test_name+"\n") - test_outfile = h5py.File( test_name, 'w' ) - test_outfile.create_dataset( "features", data=split_X_test[i] ) - test_outfile.create_dataset( "labels", data=split_Y_test[i] ) - test_outfile.close() -with open('test_cifar10.list', 'w') as test_list_file: - for f in test_list: - test_list_file.write(f) diff --git a/models/get_mnist.py b/models/get_mnist.py deleted file mode 100644 index b5b2a14..0000000 --- a/models/get_mnist.py +++ /dev/null @@ -1,53 +0,0 @@ -### This script downloads the MNIST dataset, unpacks it, splits it into four pieces, and saves -# each piece in a separate h5 file. 
- -from numpy import array_split -from keras.datasets import mnist -from keras.utils import np_utils -from keras import backend as K -import h5py -import sys - -(X_train, Y_train), (X_test, Y_test) = mnist.load_data() - -img_rows = 28 -img_cols = 28 -if K.image_dim_ordering() == 'th': - X_train = X_train.reshape(X_train.shape[0], 1, img_rows, img_cols) - X_test = X_test.reshape(X_test.shape[0], 1, img_rows, img_cols) - input_shape = (1, img_rows, img_cols) -else: - X_train = X_train.reshape(X_train.shape[0], img_rows, img_cols, 1) - X_test = X_test.reshape(X_test.shape[0], img_rows, img_cols, 1) -input_shape = (img_rows, img_cols, 1) - -num_train_pieces = int(sys.argv[1]) if len(sys.argv)>1 else 24 -num_test_pieces = int(sys.argv[2]) if len(sys.argv)>1 else 4 -split_X_train = [ X.astype('float32') / 255 for X in array_split(X_train, num_train_pieces) ] -split_Y_train = [ np_utils.to_categorical(Y,10) for Y in array_split(Y_train, num_train_pieces) ] -split_X_test = [ X.astype('float32') / 255 for X in array_split(X_test, num_test_pieces) ] -split_Y_test = [ np_utils.to_categorical(Y,10) for Y in array_split(Y_test, num_test_pieces) ] - -train_list = [] -for i in range(num_train_pieces): - train_name = "mnist_train_%d.h5" % i - train_list.append(train_name+"\n") - train_outfile = h5py.File( train_name, 'w' ) - train_outfile.create_dataset( "features", data=split_X_train[i] ) - train_outfile.create_dataset( "labels", data=split_Y_train[i] ) - train_outfile.close() -with open('train_mnist.list', 'w') as train_list_file: - for f in train_list: - train_list_file.write(f) - -test_list = [] -for i in range(num_test_pieces): - test_name = "mnist_test_%d.h5" % i - test_list.append(test_name+"\n") - test_outfile = h5py.File( test_name, 'w' ) - test_outfile.create_dataset( "features", data=split_X_test[i] ) - test_outfile.create_dataset( "labels", data=split_Y_test[i] ) - test_outfile.close() -with open('test_mnist.list', 'w') as test_list_file: - for f in test_list: - test_list_file.write(f) diff --git a/models/get_topclass.py b/models/get_topclass.py deleted file mode 100644 index f3e1998..0000000 --- a/models/get_topclass.py +++ /dev/null @@ -1,25 +0,0 @@ -import os -import glob -import sys - -dest='/bigdata/shared/LCDJets_Abstract_IsoLep_lt_20' -import socket -host = os.environ.get('HOST', os.environ.get('HOSTNAME',socket.gethostname())) -if 'titan' in host: - dest='/ccs/proj/csc291/DATA/LCDJets_Abstract_IsoLep_lt_20' -train = glob.glob(dest+'/train/*.h5') -test = glob.glob(dest+'/val/*.h5') - -N=10 -Nt=N/5 -if len(sys.argv)>=1: - a = sys.argv[1] - if a.isdigit(): - N = int(a) - Nt=N/5 - else: - N,Nt = map(int, a.split(',')) - - -open('train_topclass.list','w').write( '\n'.join(sorted( train[:N] ))) -open('test_topclass.list','w').write( '\n'.join(sorted( test[:Nt] ))) diff --git a/nnlo/data/__init__.py b/nnlo/data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/nnlo/data/get_3d.py b/nnlo/data/get_3d.py new file mode 100644 index 0000000..bd8b4fa --- /dev/null +++ b/nnlo/data/get_3d.py @@ -0,0 +1,92 @@ +import os +import glob +import logging +try: + import h5py + pass +except: + logging.info("import h5py failed") +import numpy as np +import sys + +def get_data(datafile): + #get data for training + f=h5py.File(datafile,'r') + y=f.get('target') + X=np.array(f.get('ECAL')) + y=(np.array(y[:,1])) + X[X < 1e-4] = 0 + X = np.expand_dims(X, axis=-1) + X = X.astype(np.float32) + y = y.astype(np.float32) + y = y/100. 
+    ecal = np.squeeze(np.sum(X, axis=(1, 2, 3)))
+    logging.info("X shape {}; y shape {}; ecal shape {}".format(X.shape, y.shape, ecal.shape))
+
+    f.close()
+    return X, y, ecal
+
+def main():
+    dest='/data/shared/3DGAN/'
+    import socket
+    host = os.environ.get('HOST', os.environ.get('HOSTNAME',socket.gethostname()))
+    if 'daint' in host:
+        dest='/scratch/snx3000/vlimant/3DGAN/'
+    if 'titan' in host:
+        dest='/ccs/proj/csc291/DATA/3DGAN/'
+
+    sub_split = int(sys.argv[1]) if len(sys.argv)>1 else 1
+
+    for F in glob.glob('/bigdata/shared/LCD/NewV1/*scan/*.h5'):
+        _,d,f = F.rsplit('/',2)
+        if not 'Ele' in d: continue
+        X = None
+        if sub_split==1:
+            nf = '%s/%s_%s.h5'%( dest,d,f)
+            if os.path.isfile( nf) :
+                continue
+            logging.info("processing files {} into {}".format(F,nf))
+            if X is None:
+                X,y,ecal = get_data(F)
+            o = h5py.File(nf,'w')
+            o['X'] = X
+            o.create_group("y")
+            o['y']['a'] = np.ones(y.shape)
+            o['y']['b'] = y
+            o['y']['c'] = ecal
+            o.close()
+        else:
+            for sub in range(sub_split):
+                nf = '%s/%s_%s_sub%s.h5'%(dest, d,f,sub)
+                if os.path.isfile( nf) :
+                    continue
+                logging.info("processing files {} into {}".format(F,nf))
+                if X is None:
+                    X,y,ecal = get_data(F)
+                N = X.shape[0]
+                splits = [i*N//sub_split for i in range(sub_split)]+[-1]
+                o = h5py.File(nf,'w')
+                o['X'] = X[splits[sub]:splits[sub+1],...]
+                o.create_group("y")
+                o['y']['a'] = np.ones(y[splits[sub]:splits[sub+1],...].shape)
+                o['y']['b'] = y[splits[sub]:splits[sub+1],...]
+                o['y']['c'] = ecal[splits[sub]:splits[sub+1],...]
+                o.close()
+        X = None
+
+    if sub_split == 1:
+        sub_files = lambda f:not 'sub' in f
+    else:
+        sub_files = lambda f:'sub' in f
+
+    open('train_3d.list','w').write( '\n'.join(filter(sub_files,glob.glob(dest+'/*.h5')[:-4])))
+    open('test_3d.list','w').write( '\n'.join(filter(sub_files,glob.glob(dest+'/*.h5')[-4:])))
+
+    open('train_small_3d.list','w').write( '\n'.join(filter(sub_files,glob.glob(dest+'/*.h5')[:-4])))
+    open('test_small_3d.list','w').write( '\n'.join(filter(sub_files,glob.glob(dest+'/*.h5')[-4:])))
+
+    open('train_7_3d.list','w').write( '\n'.join(filter(sub_files,glob.glob(dest+'/*.h5')[:7])))
+    open('test_1_3d.list','w').write( '\n'.join(filter(sub_files,glob.glob(dest+'/*.h5')[-1:])))
+
+if __name__ == '__main__':
+    main()
diff --git a/nnlo/data/get_cifar10.py b/nnlo/data/get_cifar10.py
new file mode 100644
index 0000000..82425bf
--- /dev/null
+++ b/nnlo/data/get_cifar10.py
@@ -0,0 +1,58 @@
+### This script downloads the cifar10 dataset, unpacks it, splits it into four pieces, and saves
+# each piece in a separate h5 file.
+
+from numpy import array_split
+from tensorflow.keras.datasets import cifar10
+from tensorflow.python.keras.utils import np_utils
+from tensorflow.python.keras import backend as K
+import h5py
+import os
+import sys
+
+def main(argv):
+    (X_train, Y_train), (X_test, Y_test) = cifar10.load_data()
+
+    img_rows = 32
+    img_cols = 32
+    if K.image_data_format() == 'channels_first':
+        X_train = X_train.reshape(X_train.shape[0], 3, img_rows, img_cols)
+        X_test = X_test.reshape(X_test.shape[0], 3, img_rows, img_cols)
+        input_shape = (3, img_rows, img_cols)
+    else:
+        X_train = X_train.reshape(X_train.shape[0], img_rows, img_cols, 3)
+        X_test = X_test.reshape(X_test.shape[0], img_rows, img_cols, 3)
+        input_shape = (img_rows, img_cols, 3)
+
+    num_train_pieces = int(argv[0]) if len(argv)>0 else 24
+    num_test_pieces = int(argv[1]) if len(argv)>1 else 4
+    split_X_train = [ X.astype('float32') / 255 for X in array_split(X_train, num_train_pieces) ]
+    split_Y_train = [ np_utils.to_categorical(Y,10) for Y in array_split(Y_train, num_train_pieces) ]
+    split_X_test = [ X.astype('float32') / 255 for X in array_split(X_test, num_test_pieces) ]
+    split_Y_test = [ np_utils.to_categorical(Y,10) for Y in array_split(Y_test, num_test_pieces) ]
+
+    train_list = []
+    for i in range(num_train_pieces):
+        train_name = f"{os.getcwd()}/cifar10_train_%d.h5" % i
+        train_list.append(train_name+"\n")
+        train_outfile = h5py.File( train_name, 'w' )
+        train_outfile.create_dataset( "features", data=split_X_train[i] )
+        train_outfile.create_dataset( "labels", data=split_Y_train[i] )
+        train_outfile.close()
+    with open('train_cifar10.list', 'w') as train_list_file:
+        for f in train_list:
+            train_list_file.write(f)
+
+    test_list = []
+    for i in range(num_test_pieces):
+        test_name = f"{os.getcwd()}/cifar10_test_%d.h5" % i
+        test_list.append(test_name+"\n")
+        test_outfile = h5py.File( test_name, 'w' )
+        test_outfile.create_dataset( "features", data=split_X_test[i] )
+        test_outfile.create_dataset( "labels", data=split_Y_test[i] )
+        test_outfile.close()
+    with open('test_cifar10.list', 'w') as test_list_file:
+        for f in test_list:
+            test_list_file.write(f)
+
+if __name__ == '__main__':
+    main(sys.argv[1:])
diff --git a/nnlo/data/get_mnist.py b/nnlo/data/get_mnist.py
new file mode 100644
index 0000000..93675f0
--- /dev/null
+++ b/nnlo/data/get_mnist.py
@@ -0,0 +1,58 @@
+### This script downloads the MNIST dataset, unpacks it, splits it into four pieces, and saves
+# each piece in a separate h5 file.
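These data scripts write one HDF5 shard per piece plus `train_*.list` / `test_*.list` index files that the training drivers consume. A minimal usage sketch, assuming the packaged module is importable and that the two optional arguments are the train/test piece counts (paths, shard counts, and the shell commands in the comments are illustrative):

```python
# Hedged sketch: split MNIST into shards, then point the driver at the list files.
# Assumes `pip install nnlo` and an MPI environment; counts and paths are illustrative.
from nnlo.data.get_mnist import main as split_mnist

split_mnist(["24", "4"])   # 24 training shards, 4 test shards -> train_mnist.list / test_mnist.list

# Command-line flow mirroring the README (illustrative):
#   GetData mnist
#   mpirun -np 3 TrainingDriver --model mnist --loss categorical_crossentropy \
#          --epochs 3 --train_data train_mnist.list --val_data test_mnist.list
```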
+
+from numpy import array_split
+from tensorflow.keras.datasets import mnist
+from tensorflow.python.keras.utils import np_utils
+from tensorflow.python.keras import backend as K
+import h5py
+import os
+import sys
+
+def main(argv):
+    (X_train, Y_train), (X_test, Y_test) = mnist.load_data()
+
+    img_rows = 28
+    img_cols = 28
+    if K.image_data_format() == 'channels_first':
+        X_train = X_train.reshape(X_train.shape[0], 1, img_rows, img_cols)
+        X_test = X_test.reshape(X_test.shape[0], 1, img_rows, img_cols)
+        input_shape = (1, img_rows, img_cols)
+    else:
+        X_train = X_train.reshape(X_train.shape[0], img_rows, img_cols, 1)
+        X_test = X_test.reshape(X_test.shape[0], img_rows, img_cols, 1)
+        input_shape = (img_rows, img_cols, 1)
+
+    num_train_pieces = int(argv[0]) if len(argv)>0 else 24
+    num_test_pieces = int(argv[1]) if len(argv)>1 else 4
+    split_X_train = [ X.astype('float32') / 255 for X in array_split(X_train, num_train_pieces) ]
+    split_Y_train = [ np_utils.to_categorical(Y,10) for Y in array_split(Y_train, num_train_pieces) ]
+    split_X_test = [ X.astype('float32') / 255 for X in array_split(X_test, num_test_pieces) ]
+    split_Y_test = [ np_utils.to_categorical(Y,10) for Y in array_split(Y_test, num_test_pieces) ]
+
+    train_list = []
+    for i in range(num_train_pieces):
+        train_name = f"{os.getcwd()}/mnist_train_%d.h5" % i
+        train_list.append(train_name+"\n")
+        train_outfile = h5py.File( train_name, 'w' )
+        train_outfile.create_dataset( "features", data=split_X_train[i] )
+        train_outfile.create_dataset( "labels", data=split_Y_train[i] )
+        train_outfile.close()
+    with open('train_mnist.list', 'w') as train_list_file:
+        for f in train_list:
+            train_list_file.write(f)
+
+    test_list = []
+    for i in range(num_test_pieces):
+        test_name = f"{os.getcwd()}/mnist_test_%d.h5" % i
+        test_list.append(test_name+"\n")
+        test_outfile = h5py.File( test_name, 'w' )
+        test_outfile.create_dataset( "features", data=split_X_test[i] )
+        test_outfile.create_dataset( "labels", data=split_Y_test[i] )
+        test_outfile.close()
+    with open('test_mnist.list', 'w') as test_list_file:
+        for f in test_list:
+            test_list_file.write(f)
+
+if __name__ == '__main__':
+    main(sys.argv[1:])
diff --git a/nnlo/data/get_topclass.py b/nnlo/data/get_topclass.py
new file mode 100644
index 0000000..b30c167
--- /dev/null
+++ b/nnlo/data/get_topclass.py
@@ -0,0 +1,29 @@
+import os
+import glob
+import sys
+
+def main():
+    dest='/bigdata/shared/LCDJets_Abstract_IsoLep_lt_20'
+    import socket
+    host = os.environ.get('HOST', os.environ.get('HOSTNAME',socket.gethostname()))
+    if 'titan' in host:
+        dest='/ccs/proj/csc291/DATA/LCDJets_Abstract_IsoLep_lt_20'
+    train = glob.glob(dest+'/train/*.h5')
+    test = glob.glob(dest+'/val/*.h5')
+
+    N=10
+    Nt=N//5
+    if len(sys.argv)>1:
+        a = sys.argv[1]
+        if a.isdigit():
+            N = int(a)
+            Nt=N//5
+        else:
+            N,Nt = map(int, a.split(','))
+
+
+    open('train_topclass.list','w').write( '\n'.join(sorted( train[:N] )))
+    open('test_topclass.list','w').write( '\n'.join(sorted( test[:Nt] )))
+
+if __name__ == '__main__':
+    main()
diff --git a/nnlo/data/getdata.py b/nnlo/data/getdata.py
new file mode 100644
index 0000000..f5db855
--- /dev/null
+++ b/nnlo/data/getdata.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python
+# Rui Zhang 7.2020
+# rui.zhang@cern.ch
+
+import sys
+
+def main():
+    command = sys.argv[1]
+    argv = sys.argv[2:]
+    if command.lower() == 'mnist':
+        from nnlo.data.get_mnist import main
+        main(argv)
+    elif command.lower() == 'cifar10':
+        from nnlo.data.get_cifar10 import main
+        main(argv)
+    else:
+        raise RuntimeError('Unknown
command: {}'.format(command)) + +if __name__ == '__main__': + main() diff --git a/MPIGDriver.py b/nnlo/driver/MPIGDriver.py similarity index 99% rename from MPIGDriver.py rename to nnlo/driver/MPIGDriver.py index 52d5e14..cd39854 100755 --- a/MPIGDriver.py +++ b/nnlo/driver/MPIGDriver.py @@ -21,7 +21,7 @@ import socket -if __name__ == '__main__': +def main(): from TrainingDriver import add_loader_options parser = argparse.ArgumentParser() parser.add_argument('--verbose',help='display metrics for each training batch',action='store_true') @@ -162,3 +162,6 @@ comm.Barrier() logging.info("Terminating") + +if __name__ == '__main__': + main() diff --git a/OptimizationDriver.py b/nnlo/driver/OptimizationDriver.py similarity index 99% rename from OptimizationDriver.py rename to nnlo/driver/OptimizationDriver.py index 8b6359b..a140971 100755 --- a/OptimizationDriver.py +++ b/nnlo/driver/OptimizationDriver.py @@ -105,9 +105,7 @@ def make_opt_parser(): return parser - -if __name__ == '__main__': - +def main(): logging.info("Process is on {}".format(socket.gethostname())) parser = make_opt_parser() args = parser.parse_args() @@ -323,3 +321,6 @@ def make_opt_parser(): checkpoint=args.checkpoint, checkpoint_interval=args.checkpoint_interval) block.run() + +if __name__ == '__main__': + main() diff --git a/TrainingDriver.py b/nnlo/driver/TrainingDriver.py similarity index 81% rename from TrainingDriver.py rename to nnlo/driver/TrainingDriver.py index cc8a654..01be474 100755 --- a/TrainingDriver.py +++ b/nnlo/driver/TrainingDriver.py @@ -12,6 +12,7 @@ from mpi4py import MPI from time import time,sleep +import importlib from nnlo.mpi.manager import MPIManager, get_device from nnlo.train.algo import Algo @@ -27,6 +28,7 @@ def add_log_option(parser): # logging configuration parser.add_argument('--log-file', default=None, dest='log_file', help='log file to write, in additon to output stream') parser.add_argument('--log-level', default='info', dest='log_level', help='log level (debug, info, warn, error)') + parser.add_argument('--output', default='./', dest='output', help='output folder') def add_master_option(parser): parser.add_argument('--master-gpu',help='master process should get a gpu', @@ -89,13 +91,13 @@ def add_train_options(parser): parser.add_argument('--thread_validation', help='run a single process', action='store_true') # model arguments - parser.add_argument('--model', help='File containing model architecture (serialized in JSON/pickle, or provided in a .py file') + parser.add_argument('--model', choices=['mnist', 'mnist_torch', 'cifar10', 'cifar10_torch'], help='File containing model architecture (serialized in JSON/pickle, or provided in a .py file') parser.add_argument('--trial-name', help='descriptive name for trial', default='train', dest='trial_name') # training data arguments - parser.add_argument('--train_data', help='text file listing data inputs for training', default=None) - parser.add_argument('--val_data', help='text file lis`ting data inputs for validation', default=None) + parser.add_argument('--train_data', help='text file listing data inputs for training', required=True) + parser.add_argument('--val_data', help='text file lis`ting data inputs for validation', required=True) parser.add_argument('--features-name', help='name of HDF5 dataset with input features', default='features', dest='features_name') parser.add_argument('--labels-name', help='name of HDF5 dataset with output labels', @@ -198,21 +200,11 @@ def make_algo( args, use_tf, comm, validate_every ): def 
make_train_val_lists(m_module, args): train_list = val_list = [] - if args.train_data: - with open(args.train_data) as train_list_file: - train_list = [ s.strip() for s in train_list_file.readlines() ] - elif m_module is not None: - train_list = m_module.get_train() - else: - logging.info("no training data provided") + with open(args.train_data) as train_list_file: + train_list = [ s.strip() for s in train_list_file.readlines() ] - if args.val_data: - with open(args.val_data) as val_list_file: - val_list = [ s.strip() for s in val_list_file.readlines() ] - elif m_module is not None: - val_list = m_module.get_val() - else: - logging.info("no validation data provided") + with open(args.val_data) as val_list_file: + val_list = [ s.strip() for s in val_list_file.readlines() ] if not train_list: logging.error("No training data provided") @@ -220,12 +212,7 @@ def make_train_val_lists(m_module, args): logging.error("No validation data provided") return (train_list, val_list) -def make_features_labels(m_module, args): - features_name = m_module.get_features() if m_module is not None and hasattr(m_module,"get_features") else args.features_name - labels_name = m_module.get_labels() if m_module is not None and hasattr(m_module,"get_labels") else args.labels_name - return (features_name, labels_name) - -if __name__ == '__main__': +def main(): parser = make_train_parser() args = parser.parse_args() initialize_logger(filename=args.log_file, file_level=args.log_level, stream_level=args.log_level) @@ -234,8 +221,21 @@ def make_features_labels(m_module, args): if 'torch' in args.model: a_backend = 'torch' - m_module = __import__(args.model.replace('.py','').replace('/', '.'), fromlist=[None]) if '.py' in args.model else None - (features_name, labels_name) = make_features_labels(m_module, args) + m_module, model_source = None, None + try: + if args.model == 'mnist': + m_module = importlib.import_module(f'nnlo.models.model_mnist_tf') + model_source = 'models/model_mnist_tf.py' + elif args.model == 'mnist_torch': + m_module = importlib.import_module(f'nnlo.models.model_mnist_torch') + model_source = 'models/model_mnist_torch.py' + elif args.model == 'cifar10': + m_module = importlib.import_module(f'nnlo.models.model_cifar10_tf') + model_source = 'models/model_cifar10_tf.py' + except Exception as e: + logging.fatal(e) + + (features_name, labels_name) = args.features_name, args.labels_name (train_list, val_list) = make_train_val_lists(m_module, args) comm = MPI.COMM_WORLD.Dup() @@ -255,37 +255,42 @@ def make_features_labels(m_module, args): if use_torch: logging.debug("Using pytorch") - model_builder = ModelPytorch(comm, source=args.model, weights=model_weights, gpus=1 if 'gpu' in device else 0) + model_builder = ModelPytorch(comm, source=model_source, weights=model_weights, gpus=1 if 'gpu' in device else 0) else: logging.debug("Using TensorFlow") os.environ['KERAS_BACKEND'] = 'tensorflow' + import tensorflow as tf import_keras() - import keras.backend as K - gpu_options=K.tf.GPUOptions( - per_process_gpu_memory_fraction=0.1, #was 0.0 - allow_growth = True, - visible_device_list = device[-1] if 'gpu' in device else '') - gpu_options=K.tf.GPUOptions( - per_process_gpu_memory_fraction=0.0, - allow_growth = True,) + #tf.config.gpu.set_per_process_memory_fraction(0.1) + #gpu_options=K.tf.GPUOptions( + # per_process_gpu_memory_fraction=0.1, #was 0.0 + # allow_growth = True, + # visible_device_list = device[-1] if 'gpu' in device else '') + #gpu_options=K.tf.GPUOptions( + # per_process_gpu_memory_fraction=0.0, + # 
allow_growth = True,) + gpu_devices = tf.config.experimental.list_physical_devices('GPU') + for device in gpu_devices: + tf.config.experimental.set_memory_growth(device, True) + #NTHREADS=(2,1) - NTHREADS=None - if NTHREADS is None: - K.set_session( K.tf.Session( config=K.tf.ConfigProto( - allow_soft_placement=True, log_device_placement=False, - gpu_options=gpu_options - ) ) ) - else: - K.set_session( K.tf.Session( config=K.tf.ConfigProto( - allow_soft_placement=True, log_device_placement=False, - gpu_options=gpu_options, - intra_op_parallelism_threads=NTHREADS[0], - inter_op_parallelism_threads=NTHREADS[1], - ) ) ) + #NTHREADS=None + #if NTHREADS is None: + # K.set_session( K.tf.Session( config=K.tf.ConfigProto( + # allow_soft_placement=True, log_device_placement=False, + # gpu_options=gpu_options + # ) ) ) + #else: + # K.set_session( K.tf.Session( config=K.tf.ConfigProto( + # allow_soft_placement=True, log_device_placement=False, + # gpu_options=gpu_options, + # intra_op_parallelism_threads=NTHREADS[0], + # inter_op_parallelism_threads=NTHREADS[1], + # ) ) ) - model_builder = ModelTensorFlow( comm, source=args.model, weights=model_weights) + model_builder = ModelTensorFlow( comm, source=model_source, weights=model_weights) data = make_loader(args, features_name, labels_name, train_list) @@ -313,8 +318,8 @@ def make_features_labels(m_module, args): else: model_name = os.path.basename(args.model).replace('.json','') - json_name = '_'.join([model_name,args.trial_name,"history.json"]) - tl_json_name = '_'.join([model_name,args.trial_name,"timeline.json"]) + json_name = args.output + '/' + '_'.join([model_name,args.trial_name,"history.json"]) + tl_json_name = args.output + '/' + '_'.join([model_name,args.trial_name,"timeline.json"]) # Process 0 launches the training procedure if comm.Get_rank() == 0: @@ -333,3 +338,6 @@ def make_features_labels(m_module, args): comm.barrier() logging.info("Terminating") if args.timeline: Timeline.collect(clean=True, file_name=tl_json_name) + +if __name__ == '__main__': + main() diff --git a/nnlo/driver/__init__.py b/nnlo/driver/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/nnlo/models/__init__.py b/nnlo/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/nnlo/models/model_cifar10_tf.py b/nnlo/models/model_cifar10_tf.py new file mode 100644 index 0000000..7b693c4 --- /dev/null +++ b/nnlo/models/model_cifar10_tf.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python +# Rui Zhang 8.2020 +# rui.zhang@cern.ch + +def get_name(): + return 'cifar10' + +def get_model(**args): + from tensorflow.keras.models import Sequential, Model + from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten, Input, Permute + from tensorflow.keras.layers import Convolution2D, MaxPooling2D, Conv2D + import tensorflow.keras.backend as K + if args:logging.debug("receiving arguments {}".format(args)) + nb_classes = 10 + img_rows, img_cols = 32, 32 + + # use 1 kernel size for all convolutional layers + ks = args.get('kernel_size', 3) + + # tune the number of filters for each convolution layer + nb_filters1 = args.get('nb_filters1', 48) + nb_filters2 = args.get('nb_filters2', 96) + nb_filters3 = args.get('nb_filters3', 192) + + # tune the pool size once + ps = args.get('pool_size', 2) + pool_size = (ps,ps) + + # tune the dropout rates independently + do4 = args.get('dropout1', 0.25) + do5 = args.get('dropout2', 0.5) + + # tune the dense layers independently + dense1 = args.get('dense1', 512) + dense2 = args.get('dense2', 256) + + if 
K.image_data_format() == 'channels_first': + input_shape = (3, img_rows, img_cols) + else: + input_shape = (img_rows, img_cols, 3) + + #act = 'sigmoid' + act = 'relu' + + i = Input( input_shape) + l = Conv2D(nb_filters1,( ks, ks), padding='same', activation = act)(i) + l = MaxPooling2D(pool_size=pool_size)(l) + #l = Dropout(do1)(l) + + l = Conv2D(nb_filters2, (ks, ks), padding='same',activation=act)(l) + #l = Conv2D(nb_filters2, (ks, ks))(l) + l = MaxPooling2D(pool_size=pool_size)(l) + #l = Dropout(do2)(l) + + l = Conv2D(nb_filters3, (ks, ks), padding='same',activation=act)(l) + #l = Conv2D(nb_filters3, (ks, ks))(l) + l = MaxPooling2D(pool_size=pool_size)(l) + #l = Dropout(do3)(l) + + l = Flatten()(l) + l = Dense(dense1,activation=act)(l) + l = Dropout(do4)(l) + l = Dense(dense2,activation=act)(l) + l =Dropout(do5)(l) + + o = Dense(nb_classes, activation='softmax')(l) + + model = Model(inputs=i, outputs=o) + #model.summary() + + return model + +from skopt.space import Real, Integer, Categorical +get_model.parameter_range = [ + Integer(10,300, name='nb_filters1'), + Integer(10,300, name='nb_filters2'), + Integer(10,300, name='nb_filters3'), + Integer(50,1000, name='dense1'), + Integer(50,1000, name='dense2'), + Real(0.0, 1.0, name='dropout1'), + Real(0.0, 1.0, name='dropout2') +] diff --git a/nnlo/models/model_example_tf.py b/nnlo/models/model_example_tf.py new file mode 100644 index 0000000..43d8c2e --- /dev/null +++ b/nnlo/models/model_example_tf.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python +# Rui Zhang 8.2020 +# rui.zhang@cern.ch + +def get_name(): + return 'example' + +def get_model(**args): + """Example model from keras documentation""" + from tensorflow.keras.models import Sequential + from tensorflow.keras.layers import Dense, Activation + model = Sequential() + model.add(Dense(output_dim=64, input_dim=100)) + model.add(Activation("relu")) + model.add(Dense(output_dim=10)) + model.add(Activation("softmax")) + return model diff --git a/examples/example_hls4mlgru.py b/nnlo/models/model_hls4mlgru.py similarity index 96% rename from examples/example_hls4mlgru.py rename to nnlo/models/model_hls4mlgru.py index fa168d7..2eb3783 100644 --- a/examples/example_hls4mlgru.py +++ b/nnlo/models/model_hls4mlgru.py @@ -1,6 +1,6 @@ #from keras.activations import relu, selu, elu -from keras.models import Model, Sequential -from keras.layers import Dense, Input, GRU, Dropout, Flatten, Permute +from keras.models import Model +from keras.layers import Dense, Input, GRU, Dropout, Permute import numpy as np def get_model(**args): diff --git a/examples/example_jedi_torch.py b/nnlo/models/model_jedi_torch.py similarity index 99% rename from examples/example_jedi_torch.py rename to nnlo/models/model_jedi_torch.py index df826fc..cb6fa6b 100644 --- a/examples/example_jedi_torch.py +++ b/nnlo/models/model_jedi_torch.py @@ -213,7 +213,6 @@ def get_labels(): if __name__ == "__main__": - print("do the data conversion") import glob import h5py import numpy as np @@ -230,5 +229,4 @@ def get_labels(): fo['X'] = X fo['Y'] = Y fo.close() - print(f,"converted") diff --git a/nnlo/models/model_mnist_tf.py b/nnlo/models/model_mnist_tf.py new file mode 100644 index 0000000..8c15019 --- /dev/null +++ b/nnlo/models/model_mnist_tf.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python +# Rui Zhang 8.2020 +# rui.zhang@cern.ch + +def get_name(): + return 'mnist' + +def get_model(**args): + from tensorflow.keras.models import Sequential, Model + from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten, Input, Permute + from 
tensorflow.keras.layers import Convolution2D, MaxPooling2D, Conv2D + import tensorflow.keras.backend as K + """MNIST ConvNet from keras/examples/mnist_cnn.py""" + #np.random.seed(1337) # for reproducibility + if args:logging.debug("receiving arguments {}".format(args)) + nb_classes = 10 + # input image dimensions + img_rows, img_cols = 28, 28 + # number of convolutional filters to use + nb_filters = args.get('nb_filters',32) + # size of pooling area for max pooling + ps = args.get('pool_size',2) + + # convolution kernel size + ks = args.get('kernel_size',3) + do = args.get('dropout', 0.25) + dense = args.get('dense', 128) + + pool_size = (ps,ps) + if K.image_data_format() == 'channels_first': + input_shape = (1, img_rows, img_cols) + else: + input_shape = (img_rows, img_cols, 1) + model = Sequential() + model.add(Convolution2D(nb_filters, (ks, ks), + padding='valid', + input_shape=input_shape)) + model.add(Activation('relu')) + model.add(Convolution2D(nb_filters, (ks, ks))) + model.add(Activation('relu')) + model.add(MaxPooling2D(pool_size=pool_size)) + model.add(Dropout(do)) + model.add(Flatten()) + model.add(Dense(dense)) + model.add(Activation('relu')) + model.add(Dropout(do)) + model.add(Dense(nb_classes)) + model.add(Activation('softmax')) + return model + +from skopt.space import Real, Integer, Categorical +get_model.parameter_range = [ + Integer(10,50, name='nb_filters'), + Integer(2,10, name='pool_size'), + Integer(2,10, name='kernel_size'), + Integer(50,200, name='dense'), + Real(0.0, 1.0, name='dropout') +] + diff --git a/nnlo/models/model_mnist_torch.py b/nnlo/models/model_mnist_torch.py new file mode 100644 index 0000000..e0a6df6 --- /dev/null +++ b/nnlo/models/model_mnist_torch.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python +# Rui Zhang 8.2020 +# rui.zhang@cern.ch +import torch.nn as nn +import torch.nn.functional as F + +def get_name(): + return 'mnist_torch' + +class MNistNet(nn.Module): + def __init__(self, **args): + super(MNistNet, self).__init__() + ks = int(args.get('kernel_size',5)) + do = float(args.get('dropout',0.5)) + dense = int(args.get('dense',50)) + self.conv1 = nn.Conv2d(1, 10, kernel_size=ks) + self.conv2 = nn.Conv2d(10, 20, kernel_size=ks) + self.conv2_drop = nn.Dropout2d(do) + self.fc1 = nn.Linear(320, dense) + self.fc2 = nn.Linear(dense, 10) + + def forward(self, x): + x = x.permute(0,3,1,2).float() + x = F.relu(F.max_pool2d(self.conv1(x), 2)) + x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) + x = x.view(-1, 320) + x = F.relu(self.fc1(x)) + x = F.dropout(x, training=self.training) + x = self.fc2(x) + #return F.log_softmax(x, dim=1) + #return F.softmax(x) + #return F.cross_entropy(x) + return x + +def get_model(**args): + if args:logging.debug("receiving arguments {}".format(args)) + model = MNistNet(**args) + return model + +from skopt.space import Real, Integer, Categorical +get_model.parameter_range = [ + Integer(2,10, name='kernel_size'), + Integer(50,200, name='dense'), + Real(0.0, 1.0, name='dropout') +] diff --git a/nnlo/models/model_topclass_tf.py b/nnlo/models/model_topclass_tf.py new file mode 100644 index 0000000..18cc99b --- /dev/null +++ b/nnlo/models/model_topclass_tf.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python +# Rui Zhang 8.2020 +# rui.zhang@cern.ch + +def get_name(): + return 'topclass' + +def get_model(**args): + from tensorflow.keras.models import Sequential, Model + from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten, Input, Permute + from tensorflow.keras.layers import Convolution2D, MaxPooling2D, Conv2D + if 
args:logging.debug("receiving arguments {}".format(args)) + conv_layers=args.get('conv_layers',2) + dense_layers=args.get('dense_layers',2) + dropout=args.get('dropout',0.2) + kernel = args.get('kernel_size',3) + classes=3 + in_channels=5 + in_ch = in_channels + ## the trace in the input file is 750, 150, 94, 5 + input = Input( (150,94,in_ch)) + ## convs + c = input + for i in range(conv_layers): + channel_in = in_ch*((i+1)%5) + channel_out = in_ch*((i+2)%5) + if channel_in == 0: channel_in += 1 + if channel_out == 0: channel_out += 1 + c = Conv2D( filters=channel_out, kernel_size=(kernel,kernel) , strides=1, padding="same", activation = 'relu') (c) + c = Conv2D(1, (kernel,kernel), activation = 'relu',strides=2, padding="same")(c) + + ## pooling + pool = args.get('pool', 10) + m = MaxPooling2D((pool,pool))(c) + f = Flatten()(m) + d = f + base = args.get('hidden_factor',5)*100 + for i in range(dense_layers): + N = int(base//(2**(i+1))) + d = Dense( N, activation='relu')(d) + if dropout: + d = Dropout(dropout)(d) + o = Dense(classes, activation='softmax')(d) + + model = Model(inputs=input, outputs=o) + #model.summary() + return model + +from skopt.space import Real, Integer, Categorical +get_model.parameter_range = [ + Integer(1,6, name='conv_layers'), + Integer(1,6, name='dense_layers'), + Integer(1,6, name='kernel_size'), + Real(0.0, 1.0, name='dropout') +] diff --git a/models/TorchModels.py b/nnlo/models/model_topclass_torch.py similarity index 71% rename from models/TorchModels.py rename to nnlo/models/model_topclass_torch.py index fa663e7..6560be9 100644 --- a/models/TorchModels.py +++ b/nnlo/models/model_topclass_torch.py @@ -1,9 +1,10 @@ +#!/usr/bin/env python +# Rui Zhang 8.2020 +# rui.zhang@cern.ch + import torch from torch.autograd import Variable import torch.nn as nn -import torch.nn.parallel -import torch.backends.cudnn as cudnn -import torch.distributed as dist import torch.optim import torch.utils.data.distributed import torchvision.transforms as transforms @@ -11,32 +12,7 @@ import torchvision.models as models import torch.nn.functional as F import numpy - -class MNistNet(nn.Module): - def __init__(self, **args): - super(MNistNet, self).__init__() - ks = int(args.get('kernel_size',5)) - do = float(args.get('dropout',0.5)) - dense = int(args.get('dense',50)) - self.conv1 = nn.Conv2d(1, 10, kernel_size=ks) - self.conv2 = nn.Conv2d(10, 20, kernel_size=ks) - self.conv2_drop = nn.Dropout2d(do) - self.fc1 = nn.Linear(320, dense) - self.fc2 = nn.Linear(dense, 10) - - def forward(self, x): - x = x.permute(0,3,1,2).float() - x = F.relu(F.max_pool2d(self.conv1(x), 2)) - x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) - x = x.view(-1, 320) - x = F.relu(self.fc1(x)) - x = F.dropout(x, training=self.training) - x = self.fc2(x) - #return F.log_softmax(x, dim=1) - #return F.softmax(x) - #return F.cross_entropy(x) - return x - +import logging ### Build a customized CNN with given hyperparameters @@ -65,7 +41,7 @@ def __init__(self, dense_layers, dropout ,base): for i in range(dense_layers): il = int(base//(2**i)) ol = int(base//(2**(i+1))) - print (il,"=>",ol) + logging.info("{} =>> {}".format(il,ol)) self.add_module('denselayer%d'%(i), nn.Linear(il, ol)) self.add_module('relu%d'%(i), nn.ReLU(inplace=True)) self.dropout = dropout @@ -89,7 +65,7 @@ def build_net(self,*args, **kwargs): self.adapt_pool = nn.AdaptiveMaxPool2d((base_2,base_2)) il = int(base//(2**(args[1]))) ol = int(args[3]) - print (il,"=>",ol) + logging.info("{} =>> {}".format(il,ol)) self.output = nn.Linear(il, ol) 
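Each module under `nnlo/models/` now exposes the same small interface: `get_name()`, `get_model(**args)`, and an optional `get_model.parameter_range` for the skopt-based optimization. A minimal sketch of how a caller can resolve and build one of these models, assuming the `nnlo.models.model_<name>_<backend>` naming used in this diff (the helper below is hypothetical, not part of the package):

```python
# Hypothetical helper illustrating the model-module interface; not part of nnlo itself.
import importlib

def build_model(name, backend="tf", **hyperparams):
    # e.g. nnlo.models.model_mnist_tf or nnlo.models.model_mnist_torch
    module = importlib.import_module("nnlo.models.model_{}_{}".format(name, backend))
    model = module.get_model(**hyperparams)                      # Keras or torch.nn model
    space = getattr(module.get_model, "parameter_range", None)   # skopt search space, if defined
    return module.get_name(), model, space

# trial, model, space = build_model("mnist", backend="tf")
```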
def forward(self, x): @@ -101,3 +77,26 @@ def forward(self, x): return self.output(x) +def get_name(): + return 'topclass_torch' + +def get_model(**args): + if args:logging.debug("receiving arguments {}".format(args)) + conv_layers=args.get('conv_layers',2) + dense_layers=args.get('dense_layers',2) + dropout=args.get('dropout',0.5) + classes=3 + in_channels=5 + try: + from TorchModels import CNN + except: + from .TorchModels import CNN + model = CNN(conv_layers=conv_layers, dense_layers=dense_layers, dropout=dropout, classes=classes, in_channels=in_channels) + return model + +from skopt.space import Real, Integer, Categorical +get_model.parameter_range = [ + Integer(1,6, name='conv_layers'), + Integer(1,6, name='dense_layers'), + Real(0.0,1.0, name='dropout') +] diff --git a/nnlo/train/GanModel.py b/nnlo/train/GanModel.py index 93389d3..c939089 100644 --- a/nnlo/train/GanModel.py +++ b/nnlo/train/GanModel.py @@ -1,17 +1,16 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -from __future__ import print_function from collections import defaultdict try: import cPickle as pickle except ImportError: import pickle -import keras -from keras.models import Model -from keras.layers import Input -from keras import optimizers -from keras.optimizers import RMSprop,SGD +import tensorflow.keras as keras +from tensorflow.keras.models import Model +from tensorflow.keras.layers import Input +from tensorflow.keras import optimizers +from tensorflow.keras.optimizers import RMSprop,SGD #from EcalEnergyGan import generator, discriminator import numpy as np import numpy.core.umath_tests as umath @@ -23,17 +22,16 @@ import logging import keras.backend as K -from keras.models import Model, Sequential -from keras.layers import (Input, Dense, Reshape, Flatten, Lambda, merge, +from tensorflow.keras.models import Model, Sequential +from tensorflow.keras.layers import (Input, Dense, Reshape, Flatten, Lambda, merge, Dropout, BatchNormalization, Activation, Embedding) -from keras.layers.advanced_activations import LeakyReLU -from keras.layers.convolutional import (UpSampling3D, Conv3D, ZeroPadding3D, +from tensorflow.keras.layers.advanced_activations import LeakyReLU +from tensorflow.keras.layers.convolutional import (UpSampling3D, Conv3D, ZeroPadding3D, AveragePooling3D) from ..train.model import MPIModel, ModelBuilder from .optimizer import OptimizerBuilder -import keras kv2 = keras.__version__.startswith('2') def hn(): @@ -46,8 +44,6 @@ def weights(m): _weights_names += [ll.name for ll in layer.weights] _weights = m.get_weights() _disp = [(np.min(s),np.max(s),np.mean(s),np.std(s),s.shape,n) for s,n in zip(_weights,_weights_names)] - #for ii,dd in enumerate(_disp): - # print (ii,dd) def weights_diff( m ,lap=True, init=False,label='', alert=None):#1000.): if (weights_diff.old_weights is None) or init: @@ -62,14 +58,10 @@ def weights_diff( m ,lap=True, init=False,label='', alert=None):#1000.): ## make the diffs _diffs = [np.subtract(a,b) for (a,b) in zip(check_on_weight,and_check_on_weight)] _diffsN = [(np.min(s),np.max(s),np.mean(s),np.std(s),s.shape,n) for s,n in zip(_diffs,_weights_names)] - #print ('\n'.join(['%s'%dd for dd in _diffsN])) for ii,dd in enumerate(_diffsN): if alert: if not any([abs(vv) > alert for vv in dd[:3]]): continue - #print (ii,'WD %s'%label,dd) - #if dd[-2] == (8,): - # print ("\t",_diffs[ii]) if lap: weights_diff.old_weights = m.get_weights() @@ -250,16 +242,11 @@ def get_moments(images, sumsx, sumsy, sumsz, totalE, m): def load_sorted(sorted_path): sorted_files = sorted(glob.glob(sorted_path)) 
- #print ("found sorterd files",sorted( sorted_files)) energies = [] srt = {} for f in sorted_files: - #print (f) - #energy = int(list(filter(str.isdigit, f))[:-1]) file_name=f[f.find('sorted_'):-1] - #energy = int(''.join(list(filter(str.isdigit, f))[:-1])) energy = int(''.join(list(filter(str.isdigit, file_name))[:-1]))*10 - #print ("found files for energy",energy) energies.append(energy) srtfile = h5py.File(f,'r') srt["events" + str(energy)] = np.array(srtfile.get('ECAL')) @@ -368,22 +355,16 @@ def __init__(self, **args): self.calculate_fom = args.get('calculate_fom',True) if self.tell: - #print ("Generator summary") - #self.generator.summary() - #print ("Discriminator summary") - #self.discriminator.summary() - #print ("Combined summary") - #self.combined.summary() pass - if True: - if self.with_fixed_disc: print ("the batch norm weights are fixed. heavey weight re-assigning") - if self.checkpoint: print ("Checkpointing the model weigths after %d batch, based on the process id"%self.checkpoint) - if self._onepass: print ("Training in one pass") - if self._reversedorder: print ("will train generator first, then discriminator") - if self._heavycheck: print("running heavy check on weight sanity") - if self._show_values: print("showing the input values at each batch") - if self._show_loss: print("showing the loss at each batch") - if self._show_weights: print("showing weights statistics at each batch") + #if True: + # if self.with_fixed_disc: print ("the batch norm weights are fixed. heavey weight re-assigning") + # if self.checkpoint: print ("Checkpointing the model weigths after %d batch, based on the process id"%self.checkpoint) + # if self._onepass: print ("Training in one pass") + # if self._reversedorder: print ("will train generator first, then discriminator") + # if self._heavycheck: print("running heavy check on weight sanity") + # if self._show_values: print("showing the input values at each batch") + # if self._show_loss: print("showing the loss at each batch") + # if self._show_weights: print("showing weights statistics at each batch") MPIModel.__init__(self, models = [ self.discriminator, @@ -472,13 +453,10 @@ def big_assemble_models(self): def ext_assemble_models(self): - #print('[INFO] Building generator') self.generator = generator(self.latent_size, with_bn = self.gen_bn) - #print('[INFO] Building discriminator') self.discriminator = discriminator(discr_drop_out = self.discr_drop_out) if self.with_fixed_disc: self.fixed_discriminator = discriminator(discr_drop_out = self.discr_drop_out, fixed_bn=True) - #print('[INFO] Building combined') latent = Input(shape=(self.latent_size, ), name='combined_z') fake_image = self.generator(latent) if self.with_fixed_disc: @@ -494,7 +472,6 @@ def ext_assemble_models(self): def compile(self, **args): ## args are fully ignored here - #print('[INFO] IN GAN MODEL: COMPILE') if 'optimizer' in args and isinstance(args['optimizer'], OptimizerBuilder): opt_builder = args['optimizer'] else: @@ -512,7 +489,6 @@ def make_opt(**args): else: opt = SGD(lr=lr) - #print ("optimizer for compiling",opt) return opt self.generator.compile( @@ -536,14 +512,11 @@ def make_opt(**args): loss_weights=self.discr_loss_weights ) self.combined.metrics_names = self.discriminator.metrics_names - #print ("disc metrics",self.discriminator.metrics_names) - #print ("comb metrics",self.combined.metrics_names) if hasattr(self, 'calculate_fom'): self.energies, self.g4var = self.prepare_geant4_data() - #print ("compiled") def assemble_models(self): self.ext_assemble_models() 
@@ -554,44 +527,33 @@ def batch_transform(self, x, y): y_disc_real =y show_values = self._show_values def mm( label, t): - #print (label,np.min(t),np.max(t),np.mean(t),np.std(t),t.shape) pass if self.batch_size is None: ## fix me, maybe self.batch_size = x_disc_real.shape[0] - #print (hn(),"initializing sizes",x_disc_real.shape,[ yy.shape for yy in y]) noise = np.random.normal(0, 1, (self.batch_size, self.latent_size)) sampled_energies = np.random.uniform(0.1, 5,(self.batch_size,1)) generator_ip = np.multiply(sampled_energies, noise) - #if show_values: print ('energies',np.ravel(sampled_energies)[:10]) if show_values: mm('energies',sampled_energies) ratio = np.polyval(root_fit, sampled_energies) - #if show_values: print ('ratios',np.ravel(ratio)[:10]) if show_values: mm('ratios',ratio) ecal_ip = np.multiply(ratio, sampled_energies) - #if show_values: print ('estimated sum cells',np.ravel(ecal_ip)[:10]) if show_values: mm('estimated sum cells',ecal_ip) now = time.mktime(time.gmtime()) - #if self.p_cc>1 and len(self.p_t)%100==0: - # print ("prediction average",np.mean(self.p_t),"[s]' over",len(self.p_t)) generated_images = self.generator.predict(generator_ip) ecal_rip = np.squeeze(np.sum(generated_images, axis=(1, 2, 3))) - #if show_values: print ('generated sum cells',np.ravel(ecal_rip)[:10]) if show_values: mm('generated sum cells',ecal_rip) norm_overflow = False apply_identify = False ## False was intended originally if norm_overflow and np.max( ecal_rip ) > 1000.: - #if show_values: print ("normalizing back") - #ecal_ip = ecal_rip generated_images /= np.max( generated_images ) ecal_rip = np.squeeze(np.sum(generated_images, axis=(1, 2, 3))) - #if show_values: print ('generated sum cells',np.ravel(ecal_rip)[:10]) if show_values: mm('generated sum cells',ecal_rip) elif apply_identify: ecal_ip = ecal_rip @@ -625,7 +587,6 @@ def mm( label, t): c_noise = np.random.normal(0, 1, (2*self.batch_size, self.latent_size)) - ###print ('noise',np.ravel(noise)[:10]) c_sampled_energies = np.random.uniform(0.1, 5, (2*self.batch_size,1 )) c_generator_ip = np.multiply(c_sampled_energies, c_noise) c_ratio = np.polyval(root_fit, c_sampled_energies) @@ -651,9 +612,6 @@ def test_on_batch(self,x, y, sample_weight=None): (X_for_disc,Y_for_disc,X_for_combined,Y_for_combined) = self.batch_transform(x,y) epoch_disc_loss = self.discriminator.test_on_batch(X_for_disc,Y_for_disc) epoch_gen_loss = self.combined.test_on_batch(X_for_combined,Y_for_combined) - #if show_loss: - # print ("test discr loss",epoch_disc_loss) - # print ("test combined loss",epoch_gen_loss) else: ((x_disc_real,re_y),(generated_images, y_disc_fake),(x_comb1,y_comb1),(x_comb2,y_comb2)) = self.batch_transform(x,y) real_disc_loss = self.discriminator.test_on_batch( x_disc_real,re_y ) @@ -663,9 +621,6 @@ def test_on_batch(self,x, y, sample_weight=None): c_loss1= self.combined.test_on_batch( x_comb1,y_comb1 ) c_loss2= self.combined.test_on_batch(x_comb2,y_comb2 ) epoch_gen_loss = [(a + b) / 2 for a, b in zip(c_loss1,c_loss2)] - #if show_loss: - # print ("test discr loss",real_disc_loss,fake_disc_loss) - # print ("test combined loss",c_loss1, c_loss2) @@ -684,7 +639,7 @@ def train_on_batch(self, x, y, def _checkpoint(self): if self.checkpoint and (self.g_cc%self.checkpoint)==0: dest='%s/mpi_generator_%s_%s.h5'%(os.environ.get('GANCHECKPOINTLOC','.'),socket.gethostname(),os.getpid()) - print ("Saving generator to",dest,"at",self.g_cc) + logging.info("Saving generator to {} at {}".format(dest, self.g_cc)) self.generator.save_weights(dest) def 
_onepass_train_on_batch(self, x, y, @@ -708,8 +663,6 @@ def _train_disc(): self.discriminator.trainable = True now = time.mktime(time.gmtime()) epoch_disc_loss = self.discriminator.train_on_batch(X_for_disc,Y_for_disc) - #if show_loss: - # print (self.d_cc," discr loss",epoch_disc_loss) done = time.mktime(time.gmtime()) if self.d_cc: self.d_t.append( done - now ) @@ -725,13 +678,10 @@ def _train_comb(noT=False): self.discriminator.trainable = False now = time.mktime(time.gmtime()) if noT: - #print ("evaluating the combined model") epoch_gen_loss = self.combined.test_on_batch(X_for_combined,Y_for_combined) else: epoch_gen_loss = self.combined.train_on_batch(X_for_combined,Y_for_combined) - #if show_loss: - # print (self.g_cc,"combined loss",epoch_gen_loss) done = time.mktime(time.gmtime()) if self.g_cc: self.g_t.append( done - now ) @@ -766,12 +716,6 @@ def _train_comb(noT=False): weights( self.combined ) - #if len(self.g_t)>0 and len(self.g_t)%100==0: - # print ("generator average ",np.mean(self.g_t),"[s] over",len(self.g_t)) - - #if len(self.d_t)>0 and len(self.d_t)%100==0: - # print ("discriminator average",np.mean(self.d_t),"[s] over ",len(self.d_t)) - self._checkpoint() return np.asarray([epoch_disc_loss, epoch_gen_loss]) @@ -784,8 +728,6 @@ def _twopass_train_on_batch(self, x, y, show_loss = self._show_loss show_weights = self._show_weights - #if self.d_cc>1 and len(self.d_t)%100==0: - # print ("discriminator average",np.mean(self.d_t),"[s] over ",len(self.d_t)) self.discriminator.trainable = True if self._heavycheck: @@ -822,9 +764,6 @@ def _twopass_train_on_batch(self, x, y, weights_diff( on_weight , label='D-fake') - #if show_loss: - #print (self.discriminator.metrics_names) - #print (self.d_cc,"discr loss",real_batch_loss,fake_batch_loss) epoch_disc_loss = np.asarray([(a + b) / 2 for a, b in zip(real_batch_loss, fake_batch_loss)]) done = time.mktime(time.gmtime()) if self.d_cc: @@ -837,7 +776,6 @@ def _twopass_train_on_batch(self, x, y, weights( self.combined ) if self.g_cc>1 and len(self.g_t)%100==0: - #print ("generator average ",np.mean(self.g_t),"[s] over",len(self.g_t)) now = time.mktime(time.gmtime()) if self.g_cc: @@ -852,9 +790,6 @@ def _twopass_train_on_batch(self, x, y, if show_weights: weights( on_weight ) weights_diff( on_weight , label='C-2') - #if show_loss: - # #print(self.combined.metrics_names) - # print (self.g_cc,"combined loss",c_loss1,c_loss2) epoch_gen_loss = np.asarray([(a + b) / 2 for a, b in zip(c_loss1,c_loss2)]) done = time.mktime(time.gmtime()) if self.g_cc: @@ -871,18 +806,18 @@ def _twopass_train_on_batch(self, x, y, checks = [np.all(np.equal(a,b)) for (a,b) in zip(check_on_weight,and_check_on_weight)] weights_have_changed = not all(checks) weights_are_all_equal = all(checks) - print ('Weights are the same?',checks) + logging.info("Weights are the same? 
{}".format(str(checks))) if weights_have_changed: for iw,b in enumerate(checks): if not b: - print (iw,"This",check_on_weight[iw].shape) - print (np.ravel(check_on_weight[iw])[:10]) - print (iw,"And that",and_check_on_weight[iw].shape) - print (np.ravel(and_check_on_weight[iw])[:10]) + logging.info("{} This {}".format(iw,str(check_on_weight[iw].shape))) + logging.info("{}".format(np.ravel(check_on_weight[iw])[:10])) + logging.info("{} And that {}".format(iw,and_check_on_weight[iw].shape)) + logging.info("{}".format(np.ravel(and_check_on_weight[iw])[:10])) else: - print ("weights are all identical") - print (np.ravel(and_check_on_weight[1])[:10]) - print (np.ravel(check_on_weight[1])[:10]) + logging.info("weights are all identical") + logging.info("".format(str(np.ravel(and_check_on_weight[1])[:10]))) + logging.info("".format(str(np.ravel(check_on_weight[1])[:10]))) self._checkpoint() @@ -891,7 +826,7 @@ def _twopass_train_on_batch(self, x, y, switching_loss = (1.,1.) if False and not self.recompiled and epoch_disc_loss[0] 0.0001: - print ("#"*30) - print ("swithcing lr",lr,"to", nlr) + logging.info("{}".format("#"*30)) + logging.info("swithcing lr {} to {}".format(lr, nlr)) K.set_value( self.discriminator.optimizer.lr, nlr) - print (K.get_value( self.discriminator.optimizer.lr )) + logging.info("{}".format(K.get_value( self.discriminator.optimizer.lr ))) K.set_value( self.combined.optimizer.lr, nlr) - print (K.get_value( self.combined.optimizer.lr )) - print ("#"*30) + logging.info("{}".format(K.get_value( self.combined.optimizer.lr ))) return np.asarray([epoch_disc_loss, epoch_gen_loss]) @@ -947,7 +881,6 @@ def prepare_geant4_data(self, **args): return energies, var def figure_of_merit(self, **args): - #print (self.histories) delta_loss = np.abs(self.histories['discriminator_model']['val_classification_loss'][-1] - self.histories['combined_model']['val_classification_loss'][-1]) return delta_loss diff --git a/nnlo/train/model.py b/nnlo/train/model.py index 01e6754..1607eda 100644 --- a/nnlo/train/model.py +++ b/nnlo/train/model.py @@ -356,8 +356,10 @@ def test_on_batch(self, x=None, y=None, *args, **kwargs): if self.gpus > 0: x = x.cuda() target = target.cuda() - pred = self.model.forward(Variable(x, volatile=True)) - loss = self.loss(pred, Variable(target, volatile=True)) + import torch + with torch.no_grad(): + pred = self.model.forward(Variable(x)) + loss = self.loss(pred, Variable(target)) l_data = loss.data.numpy() if self.gpus == 0 else loss.data.cpu().numpy() self.metrics = [l_data] if l_data.shape==() else [l_data[0]] if 'acc' in self.metrics_names: # compute the accuracy @@ -434,7 +436,7 @@ def __init__(self, comm, source, custom_objects={}, weights=None): if isinstance(source, six.string_types): if source.endswith('.py'): - module = __import__(source.replace('.py','').replace('/', '.'), fromlist=[None]) + module = __import__('nnlo.'+source.replace('.py','').replace('/', '.'), fromlist=[None]) self.model = module.get_model() self.filename = None else: @@ -443,14 +445,13 @@ def __init__(self, comm, source, else: self.filename = None self.model = source + logging.debug("Get model {0} from file {1}".format(self.model, self.filename)) self.weights = weights self.custom_objects = custom_objects super(ModelTensorFlow, self).__init__(comm) def build_model_aux(self): - import keras.backend as K - if type(self.filename) == list: models = [] self.weights = self.weights.split(',') if self.weights else [None]*len(self.filename) @@ -464,27 +465,26 @@ def build_model_aux(self): def 
build_model(self, local_session = True): - import keras.backend as K + import tensorflow as tf if local_session: - graph = K.tf.Graph() - session = K.tf.Session(graph=graph, config=K.tf.ConfigProto( + graph = tf.Graph() + session = tf.compat.v1.Session(graph=graph, config=tf.compat.v1.ConfigProto( allow_soft_placement=True, log_device_placement=False, - gpu_options=K.tf.GPUOptions( + gpu_options=tf.compat.v1.GPUOptions( per_process_gpu_memory_fraction=1./self.comm.Get_size()) ) ) with graph.as_default(): with session.as_default(): - import keras.backend as K ret_model = self.build_model_aux() ret_model.session = session ret_model.graph = graph return ret_model else: - K.set_session( K.tf.Session( config=K.tf.ConfigProto( + tf.compat.v1.Session( config=tf.compat.v1.ConfigProto( allow_soft_placement=True, log_device_placement=False, - gpu_options=K.tf.GPUOptions( - per_process_gpu_memory_fraction=1./self.comm.Get_size()) ) ) ) + gpu_options=tf.compat.v1.GPUOptions( + per_process_gpu_memory_fraction=1./self.comm.Get_size()) ) ) return self.build_model_aux() def get_backend_name(self): @@ -497,7 +497,7 @@ def __init__(self, comm, source, super(ModelPytorch,self).__init__(comm) if isinstance(source, six.string_types): if source.endswith('.py'): - module = __import__(source.replace('.py','').replace('/', '.'), fromlist=[None]) + module = __import__('nnlo.'+source.replace('.py','').replace('/', '.'), fromlist=[None]) self.model = module.get_model() self.filename = None else: diff --git a/nnlo/train/optimizer.py b/nnlo/train/optimizer.py index 3053e92..11703c6 100644 --- a/nnlo/train/optimizer.py +++ b/nnlo/train/optimizer.py @@ -647,7 +647,7 @@ def __init__(self, name, config=None, horovod_wrapper=False): self.horovod_wrapper = horovod_wrapper def build(self): - from keras.optimizers import deserialize + from tensorflow.keras.optimizers import deserialize opt_config = {'class_name': self.name, 'config': self.config} opt = deserialize(opt_config) if self.horovod_wrapper: diff --git a/nnlo/util/count_epoch.py b/nnlo/util/count_epoch.py new file mode 100644 index 0000000..77782c9 --- /dev/null +++ b/nnlo/util/count_epoch.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python +# Rui Zhang 7.2020 +# rui.zhang@cern.ch + +import json +import logging +import sys +import pandas as pd + +def main(): + logging.basicConfig(level = logging.INFO) + filename, rows_list = [], [] + try: + filenames = sys.argv[1:] + except: + logging.fatal('python count_epoch.py Usage [json file name]') + + for filename in filenames: + with open(filename) as f: + data = json.load(f) + name = filename.split('_')[1] + dic = { + 'file': filename, + 'ranks': int(name[name.find('n')+1:name.find('g')]), + 'trainTime': data["train_time"], + } + try: + dic['val_loss'] = data["history"][r"0:0:-"]["val_loss"][-10] + dic['val_accuracy'] = data["history"][r"0:0:-"]["val_accuracy"][-10] + dic['epochs'] = len(data["history"][r"0:0:-"]["val_loss"]) + except: + dic['val_loss'] = data["history"][r"0:-:-"]["val_loss"][-10] + dic['val_accuracy'] = data["history"][r"0:-:-"]["val_accuracy"][-10] + dic['epochs'] = len(data["history"][r"0:-:-"]["val_loss"]) + rows_list.append(dic) + + df = pd.DataFrame(rows_list).sort_values('ranks') + logging.info(f'\n{df}') + +if __name__ == '__main__': + main() diff --git a/nnlo/util/utils.py b/nnlo/util/utils.py index e7a669b..73dfe7f 100644 --- a/nnlo/util/utils.py +++ b/nnlo/util/utils.py @@ -35,7 +35,7 @@ def import_keras(tries=10): try: stderr = sys.stderr sys.stderr = open(os.devnull, 'w') - import keras + import 
tensorflow.keras as keras sys.stderr = stderr return except ValueError: @@ -51,14 +51,19 @@ def load_model(filename=None, model=None, weights_file=None, custom_objects={}): weights_file: path to HDF5 file containing model weights custom_objects: A Dictionary of custom classes used in the model keyed by name""" import_keras() - from keras.models import model_from_json, clone_model + from tensorflow.keras.models import model_from_json, clone_model if filename is not None: with open( filename ) as arch_f: json_str = arch_f.readline() new_model = model_from_json( json_str, custom_objects=custom_objects) - if model is not None: + logging.info(f"Load model from filename") + elif model is not None: new_model = clone_model(model) - if weights_file is not None: + logging.info(f"Load model from model") + elif weights_file is not None: new_model.load_weights( weights_file ) + logging.info(f"Load model from weights_file") + else: + logging.error(f"Cannot load model: filename, model and weights_file are None") return new_model diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..f60a4d6 --- /dev/null +++ b/setup.py @@ -0,0 +1,35 @@ +from setuptools import setup, find_packages + +with open('README.md') as readme_file: + README = readme_file.read() + +with open('HISTORY.md') as history_file: + HISTORY = history_file.read() + +setup_args = dict( + name='nnlo', + version='0.0.7', + entry_points = { + 'console_scripts': ['TrainingDriver=nnlo.driver.TrainingDriver:main', + 'GetData=nnlo.data.getdata:main', + 'CountEpoch=nnlo.util.count_epoch:main', + ], + }, + description='Distributed Machine Learning tool for High Performance Computing', + long_description_content_type="text/markdown", + long_description=README + '\n\n' + HISTORY, + license='MIT', + packages=find_packages(), + author='NNLO team', + author_email='rui.zhang@cern.ch', + keywords=['Distributed Machine Learning', 'High Performance Computing', 'Hyperparameter optimisation'], + url='https://github.com/chnzhangrui/NNLO', + download_url='https://pypi.org/project/nnlo/' +) + +install_requires = [ + 'scikit-optimize', +] + +if __name__ == '__main__': + setup(**setup_args, install_requires=install_requires, include_package_data=True)
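The `get_model.parameter_range` attribute added to the torch example model exposes its tunable hyperparameters as `skopt` dimensions, which is consistent with `scikit-optimize` appearing in `install_requires`. The patch does not show the loop that consumes these dimensions; the sketch below is only a hypothetical illustration of how such an `Integer`/`Real` list can drive a scikit-optimize search, with a dummy objective standing in for NNLO's figure of merit.

```python
# Hypothetical sketch: consuming a parameter_range list with scikit-optimize.
# The objective is a placeholder; in NNLO the score would come from a training run.
from skopt import gp_minimize
from skopt.utils import use_named_args
from skopt.space import Integer, Real

parameter_range = [
    Integer(1, 6, name='conv_layers'),
    Integer(1, 6, name='dense_layers'),
    Real(0.0, 1.0, name='dropout'),
]

@use_named_args(parameter_range)
def objective(conv_layers, dense_layers, dropout):
    # Dummy score so the sketch runs end to end.
    return dropout + 0.01 * (conv_layers + dense_layers)

result = gp_minimize(objective, parameter_range, n_calls=12, random_state=0)
print(result.x)  # best [conv_layers, dense_layers, dropout]
```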
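In `nnlo/train/model.py` the deprecated `volatile=True` flag is replaced by a `torch.no_grad()` block for evaluation. The snippet below is a self-contained illustration of that pattern with a throwaway model; it is not NNLO code.

```python
# Minimal illustration of evaluating under torch.no_grad(), as adopted in test_on_batch.
# The model and data here are placeholders.
import torch
import torch.nn as nn

model = nn.Linear(4, 2)
loss_fn = nn.CrossEntropyLoss()
x = torch.randn(8, 4)
target = torch.randint(0, 2, (8,))

model.eval()
with torch.no_grad():  # no autograd graph is recorded during evaluation
    pred = model(x)
    loss = loss_fn(pred, target)
print(loss.item())
```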
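The new `CountEpoch` entry point expects JSON history files whose names encode the number of ranks (the digits between `n` and `g` in the second underscore-separated token) and whose payload carries `train_time` plus a `history` block keyed by `0:0:-` or `0:-:-`. The snippet below writes a made-up file in that shape; the file name and numbers are illustrative only.

```python
# Hypothetical history file that CountEpoch can parse; values and file name are invented.
import json

history = {
    "train_time": 123.4,
    "history": {
        "0:0:-": {
            # at least 10 entries, since count_epoch.py reads the value 10 entries from the end
            "val_loss":     [0.90, 0.52, 0.44, 0.39, 0.36, 0.34, 0.33, 0.32, 0.31, 0.30, 0.29],
            "val_accuracy": [0.60, 0.74, 0.79, 0.82, 0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.90],
        }
    },
}

with open("mnist_n4g1_history.json", "w") as f:
    json.dump(history, f)
# "CountEpoch mnist_n4g1_history.json" would then report ranks=4, the epoch count,
# and the validation metrics taken 10 entries from the end of the history.
```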