From 6e2e1c81a49c5abfb9b46a81619aff28898da3ad Mon Sep 17 00:00:00 2001 From: Rui Zhang Date: Tue, 28 Jul 2020 09:21:01 -0400 Subject: [PATCH 01/18] Remove keras dependency and support TF2 --- TrainingDriver.py | 4 ++-- models/Models.py | 35 +++++++++++++++++++---------------- nnlo/train/GanModel.py | 19 +++++++++---------- nnlo/train/model.py | 17 +++++++---------- nnlo/train/optimizer.py | 2 +- nnlo/util/utils.py | 4 ++-- 6 files changed, 40 insertions(+), 41 deletions(-) diff --git a/TrainingDriver.py b/TrainingDriver.py index cc8a654..00a1e8a 100755 --- a/TrainingDriver.py +++ b/TrainingDriver.py @@ -261,7 +261,7 @@ def make_features_labels(m_module, args): os.environ['KERAS_BACKEND'] = 'tensorflow' import_keras() - import keras.backend as K + tf.config.gpu.set_per_process_memory_fraction(0.1) gpu_options=K.tf.GPUOptions( per_process_gpu_memory_fraction=0.1, #was 0.0 allow_growth = True, @@ -269,7 +269,7 @@ def make_features_labels(m_module, args): gpu_options=K.tf.GPUOptions( per_process_gpu_memory_fraction=0.0, allow_growth = True,) - #NTHREADS=(2,1) + NTHREADS=(2,1) NTHREADS=None if NTHREADS is None: K.set_session( K.tf.Session( config=K.tf.ConfigProto( diff --git a/models/Models.py b/models/Models.py index 734f784..9244167 100644 --- a/models/Models.py +++ b/models/Models.py @@ -2,6 +2,8 @@ import sys import logging +from nnlo.util.utils import import_keras +import_keras() def model_function(model_name): """Constructs the Keras model indicated by model_name""" @@ -27,8 +29,8 @@ def make_model(model_name, **args): def make_example_model(): """Example model from keras documentation""" - from keras.models import Sequential - from keras.layers import Dense, Activation + from tensorflow.keras.models import Sequential + from tensorflow.keras.layers import Dense, Activation model = Sequential() model.add(Dense(output_dim=64, input_dim=100)) model.add(Activation("relu")) @@ -37,9 +39,9 @@ def make_example_model(): return model def make_topclass_model(**args): - from keras.models import Sequential, Model - from keras.layers import Dense, Activation, Dropout, Flatten, Input, Permute - from keras.layers import Convolution2D, MaxPooling2D, Conv2D + from tensorflow.keras.models import Sequential, Model + from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten, Input, Permute + from tensorflow.keras.layers import Convolution2D, MaxPooling2D, Conv2D if args:logging.debug("receiving arguments {}".format(args)) conv_layers=args.get('conv_layers',2) dense_layers=args.get('dense_layers',2) @@ -78,10 +80,10 @@ def make_topclass_model(**args): return model def make_cifar10_model(**args): - from keras.models import Sequential, Model - from keras.layers import Dense, Activation, Dropout, Flatten, Input, Permute - from keras.layers import Convolution2D, MaxPooling2D, Conv2D - import keras.backend as K + from tensorflow.keras.models import Sequential, Model + from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten, Input, Permute + from tensorflow.keras.layers import Convolution2D, MaxPooling2D, Conv2D + import tensorflow.keras.backend as K if args:logging.debug("receiving arguments {}".format(args)) nb_classes = 10 img_rows, img_cols = 32, 32 @@ -106,7 +108,7 @@ def make_cifar10_model(**args): dense1 = args.get('dense1', 512) dense2 = args.get('dense2', 256) - if K.image_dim_ordering() == 'th': + if tensorflow.keras.backend.image_data_format() == 'channels_first': input_shape = (3, img_rows, img_cols) else: input_shape = (img_rows, img_cols, 3) @@ -143,10 +145,11 @@ def 
make_cifar10_model(**args): return model def make_mnist_model(**args): - from keras.models import Sequential, Model - from keras.layers import Dense, Activation, Dropout, Flatten, Input, Permute - from keras.layers import Convolution2D, MaxPooling2D, Conv2D - import keras.backend as K + import tensorflow + from tensorflow.keras.models import Sequential, Model + from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten, Input, Permute + from tensorflow.keras.layers import Convolution2D, MaxPooling2D, Conv2D + import tensorflow.keras.backend as K """MNIST ConvNet from keras/examples/mnist_cnn.py""" #np.random.seed(1337) # for reproducibility if args:logging.debug("receiving arguments {}".format(args)) @@ -164,13 +167,13 @@ def make_mnist_model(**args): dense = args.get('dense', 128) pool_size = (ps,ps) - if K.image_dim_ordering() == 'th': + if tensorflow.keras.backend.image_data_format() == 'channels_first': input_shape = (1, img_rows, img_cols) else: input_shape = (img_rows, img_cols, 1) model = Sequential() model.add(Convolution2D(nb_filters, (ks, ks), - border_mode='valid', + padding='valid', input_shape=input_shape)) model.add(Activation('relu')) model.add(Convolution2D(nb_filters, (ks, ks))) diff --git a/nnlo/train/GanModel.py b/nnlo/train/GanModel.py index 93389d3..b66935f 100644 --- a/nnlo/train/GanModel.py +++ b/nnlo/train/GanModel.py @@ -7,11 +7,11 @@ import cPickle as pickle except ImportError: import pickle -import keras -from keras.models import Model -from keras.layers import Input -from keras import optimizers -from keras.optimizers import RMSprop,SGD +import tensorflow.keras as keras +from tensorflow.keras.models import Model +from tensorflow.keras.layers import Input +from tensorflow.keras import optimizers +from tensorflow.keras.optimizers import RMSprop,SGD #from EcalEnergyGan import generator, discriminator import numpy as np import numpy.core.umath_tests as umath @@ -23,17 +23,16 @@ import logging import keras.backend as K -from keras.models import Model, Sequential -from keras.layers import (Input, Dense, Reshape, Flatten, Lambda, merge, +from tensorflow.keras.models import Model, Sequential +from tensorflow.keras.layers import (Input, Dense, Reshape, Flatten, Lambda, merge, Dropout, BatchNormalization, Activation, Embedding) -from keras.layers.advanced_activations import LeakyReLU -from keras.layers.convolutional import (UpSampling3D, Conv3D, ZeroPadding3D, +from tensorflow.keras.layers.advanced_activations import LeakyReLU +from tensorflow.keras.layers.convolutional import (UpSampling3D, Conv3D, ZeroPadding3D, AveragePooling3D) from ..train.model import MPIModel, ModelBuilder from .optimizer import OptimizerBuilder -import keras kv2 = keras.__version__.startswith('2') def hn(): diff --git a/nnlo/train/model.py b/nnlo/train/model.py index 01e6754..5f76b22 100644 --- a/nnlo/train/model.py +++ b/nnlo/train/model.py @@ -449,8 +449,6 @@ def __init__(self, comm, source, def build_model_aux(self): - import keras.backend as K - if type(self.filename) == list: models = [] self.weights = self.weights.split(',') if self.weights else [None]*len(self.filename) @@ -464,27 +462,26 @@ def build_model_aux(self): def build_model(self, local_session = True): - import keras.backend as K + import tensorflow as tf if local_session: - graph = K.tf.Graph() - session = K.tf.Session(graph=graph, config=K.tf.ConfigProto( + graph = tf.Graph() + session = tf.compat.v1.Session(graph=graph, config=tf.compat.v1.ConfigProto( allow_soft_placement=True, log_device_placement=False, - 
gpu_options=K.tf.GPUOptions( + gpu_options=tf.compat.v1.GPUOptions( per_process_gpu_memory_fraction=1./self.comm.Get_size()) ) ) with graph.as_default(): with session.as_default(): - import keras.backend as K ret_model = self.build_model_aux() ret_model.session = session ret_model.graph = graph return ret_model else: - K.set_session( K.tf.Session( config=K.tf.ConfigProto( + tf.compat.v1.Session( config=tf.compat.v1.ConfigProto( allow_soft_placement=True, log_device_placement=False, - gpu_options=K.tf.GPUOptions( - per_process_gpu_memory_fraction=1./self.comm.Get_size()) ) ) ) + gpu_options=tf.compat.v1.GPUOptions( + per_process_gpu_memory_fraction=1./self.comm.Get_size()) ) ) return self.build_model_aux() def get_backend_name(self): diff --git a/nnlo/train/optimizer.py b/nnlo/train/optimizer.py index 3053e92..11703c6 100644 --- a/nnlo/train/optimizer.py +++ b/nnlo/train/optimizer.py @@ -647,7 +647,7 @@ def __init__(self, name, config=None, horovod_wrapper=False): self.horovod_wrapper = horovod_wrapper def build(self): - from keras.optimizers import deserialize + from tensorflow.keras.optimizers import deserialize opt_config = {'class_name': self.name, 'config': self.config} opt = deserialize(opt_config) if self.horovod_wrapper: diff --git a/nnlo/util/utils.py b/nnlo/util/utils.py index e7a669b..483d6cc 100644 --- a/nnlo/util/utils.py +++ b/nnlo/util/utils.py @@ -35,7 +35,7 @@ def import_keras(tries=10): try: stderr = sys.stderr sys.stderr = open(os.devnull, 'w') - import keras + import tensorflow.keras as keras sys.stderr = stderr return except ValueError: @@ -51,7 +51,7 @@ def load_model(filename=None, model=None, weights_file=None, custom_objects={}): weights_file: path to HDF5 file containing model weights custom_objects: A Dictionary of custom classes used in the model keyed by name""" import_keras() - from keras.models import model_from_json, clone_model + from tensorflow.keras.models import model_from_json, clone_model if filename is not None: with open( filename ) as arch_f: json_str = arch_f.readline() From 99d370657d123ff26b0f8bd9cdbfee3425b32fbf Mon Sep 17 00:00:00 2001 From: Rui Zhang Date: Tue, 28 Jul 2020 16:46:01 -0400 Subject: [PATCH 02/18] Publish pipy --- .gitignore | 7 ++- HISTORY.md | 0 MANIFEST.in | 0 __init__.py | 5 ++ examples/example_jedi_torch.py | 2 - models/BuildModel.py | 11 ++-- models/TorchModels.py | 5 +- models/get_3d.py | 12 ++-- nnlo/train/GanModel.py | 112 +++++++-------------------------- setup.py | 30 +++++++++ 10 files changed, 76 insertions(+), 108 deletions(-) create mode 100644 HISTORY.md create mode 100644 MANIFEST.in create mode 100644 __init__.py create mode 100644 setup.py diff --git a/.gitignore b/.gitignore index abe4a4a..4da7146 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ *.pyc *.swp -*.json *.h5 -*.txt -.DS_Store \ No newline at end of file +.DS_Store +build/ +dist/ +nnlo.egg-info/ diff --git a/HISTORY.md b/HISTORY.md new file mode 100644 index 0000000..e69de29 diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..e69de29 diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..26f599e --- /dev/null +++ b/__init__.py @@ -0,0 +1,5 @@ +from nnlo.optimize import * +from nnlo.mpi import * +from nnlo.train import * +from nnlo.util import * +from models import * diff --git a/examples/example_jedi_torch.py b/examples/example_jedi_torch.py index df826fc..cb6fa6b 100644 --- a/examples/example_jedi_torch.py +++ b/examples/example_jedi_torch.py @@ -213,7 +213,6 @@ def get_labels(): if __name__ == 
"__main__": - print("do the data conversion") import glob import h5py import numpy as np @@ -230,5 +229,4 @@ def get_labels(): fo['X'] = X fo['Y'] = Y fo.close() - print(f,"converted") diff --git a/models/BuildModel.py b/models/BuildModel.py index 1529ca7..5e5499b 100644 --- a/models/BuildModel.py +++ b/models/BuildModel.py @@ -5,6 +5,7 @@ import os os.environ['CUDA_VISIBLE_DEVICES']="" import argparse +import logging from Models import make_model @@ -23,7 +24,7 @@ v= float(v) model_args[k] = v if model_args: - print ("passing",model_args,"to the model builder") + logging.info("passing {} to the model builder".format(str(model_args))) model = make_model( model_name ,**model_args) else: model = make_model( model_name) @@ -33,18 +34,18 @@ if not "torch" in model_name: model.summary() model.save_weights( weights_filename, overwrite=True ) - print ("Saved model weights to {0}".format(weights_filename)) + logging.info("Saved model weights to {0}".format(weights_filename)) model_arch = model.to_json() with open( arch_filename, 'w' ) as arch_file: arch_file.write( model_arch ) - print ("Saved model architecture to {0}".format(arch_filename)) + logging.info("Saved model architecture to {0}".format(arch_filename)) else: import torch weights_filename = weights_filename.replace('h5','torch') arch_filename = arch_filename.replace('json','torch') torch.save(model.state_dict(), weights_filename) - print ("Saved model weights to {0}".format(weights_filename)) + logging.info("Saved model weights to {0}".format(weights_filename)) torch.save(model, arch_filename) - print ("Saved model architecture to {0}".format(arch_filename)) + logging.info("Saved model architecture to {0}".format(arch_filename)) diff --git a/models/TorchModels.py b/models/TorchModels.py index fa663e7..2b412c0 100644 --- a/models/TorchModels.py +++ b/models/TorchModels.py @@ -11,6 +11,7 @@ import torchvision.models as models import torch.nn.functional as F import numpy +import logging class MNistNet(nn.Module): def __init__(self, **args): @@ -65,7 +66,7 @@ def __init__(self, dense_layers, dropout ,base): for i in range(dense_layers): il = int(base//(2**i)) ol = int(base//(2**(i+1))) - print (il,"=>",ol) + logging.info("{} =>> {}".format(il,ol)) self.add_module('denselayer%d'%(i), nn.Linear(il, ol)) self.add_module('relu%d'%(i), nn.ReLU(inplace=True)) self.dropout = dropout @@ -89,7 +90,7 @@ def build_net(self,*args, **kwargs): self.adapt_pool = nn.AdaptiveMaxPool2d((base_2,base_2)) il = int(base//(2**(args[1]))) ol = int(args[3]) - print (il,"=>",ol) + logging.info("{} =>> {}".format(il,ol)) self.output = nn.Linear(il, ol) def forward(self, x): diff --git a/models/get_3d.py b/models/get_3d.py index 53a4a7b..dd72838 100644 --- a/models/get_3d.py +++ b/models/get_3d.py @@ -1,16 +1,16 @@ import os import glob +import logging try: import h5py pass except: - print ("hum") + logging.info("import h5py failed") import numpy as np import sys def get_data(datafile): #get data for training - #print ('Loading Data from .....', datafile) f=h5py.File(datafile,'r') y=f.get('target') X=np.array(f.get('ECAL')) @@ -21,9 +21,7 @@ def get_data(datafile): y = y.astype(np.float32) y = y/100. 
ecal = np.squeeze(np.sum(X, axis=(1, 2, 3))) - print (X.shape) - print (y.shape) - print (ecal.shape) + logging.info("X shape {}; y shape {}; ecal shape {}".format(str(X.shape)), str(y.shape), str(ecal.shape)) f.close() return X, y, ecal @@ -46,7 +44,7 @@ def get_data(datafile): nf = '%s/%s_%s.h5'%( dest,d,f) if os.path.isfile( nf) : continue - print ("processing files",F,"into",nf) + logging.info("processing files {} into {}".format(F,nf)) if X is None: X,y,ecal = get_data(F) o = h5py.File(nf,'w') @@ -61,7 +59,7 @@ def get_data(datafile): nf = '%s/%s_%s_sub%s.h5'%(dest, d,f,sub) if os.path.isfile( nf) : continue - print ("processing files",F,"into",nf) + logging.info("processing files {} into {}".format(F,nf)) if X is None: X,y,ecal = get_data(F) N = X.shape[0] diff --git a/nnlo/train/GanModel.py b/nnlo/train/GanModel.py index b66935f..c939089 100644 --- a/nnlo/train/GanModel.py +++ b/nnlo/train/GanModel.py @@ -1,7 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -from __future__ import print_function from collections import defaultdict try: import cPickle as pickle @@ -45,8 +44,6 @@ def weights(m): _weights_names += [ll.name for ll in layer.weights] _weights = m.get_weights() _disp = [(np.min(s),np.max(s),np.mean(s),np.std(s),s.shape,n) for s,n in zip(_weights,_weights_names)] - #for ii,dd in enumerate(_disp): - # print (ii,dd) def weights_diff( m ,lap=True, init=False,label='', alert=None):#1000.): if (weights_diff.old_weights is None) or init: @@ -61,14 +58,10 @@ def weights_diff( m ,lap=True, init=False,label='', alert=None):#1000.): ## make the diffs _diffs = [np.subtract(a,b) for (a,b) in zip(check_on_weight,and_check_on_weight)] _diffsN = [(np.min(s),np.max(s),np.mean(s),np.std(s),s.shape,n) for s,n in zip(_diffs,_weights_names)] - #print ('\n'.join(['%s'%dd for dd in _diffsN])) for ii,dd in enumerate(_diffsN): if alert: if not any([abs(vv) > alert for vv in dd[:3]]): continue - #print (ii,'WD %s'%label,dd) - #if dd[-2] == (8,): - # print ("\t",_diffs[ii]) if lap: weights_diff.old_weights = m.get_weights() @@ -249,16 +242,11 @@ def get_moments(images, sumsx, sumsy, sumsz, totalE, m): def load_sorted(sorted_path): sorted_files = sorted(glob.glob(sorted_path)) - #print ("found sorterd files",sorted( sorted_files)) energies = [] srt = {} for f in sorted_files: - #print (f) - #energy = int(list(filter(str.isdigit, f))[:-1]) file_name=f[f.find('sorted_'):-1] - #energy = int(''.join(list(filter(str.isdigit, f))[:-1])) energy = int(''.join(list(filter(str.isdigit, file_name))[:-1]))*10 - #print ("found files for energy",energy) energies.append(energy) srtfile = h5py.File(f,'r') srt["events" + str(energy)] = np.array(srtfile.get('ECAL')) @@ -367,22 +355,16 @@ def __init__(self, **args): self.calculate_fom = args.get('calculate_fom',True) if self.tell: - #print ("Generator summary") - #self.generator.summary() - #print ("Discriminator summary") - #self.discriminator.summary() - #print ("Combined summary") - #self.combined.summary() pass - if True: - if self.with_fixed_disc: print ("the batch norm weights are fixed. 
heavey weight re-assigning") - if self.checkpoint: print ("Checkpointing the model weigths after %d batch, based on the process id"%self.checkpoint) - if self._onepass: print ("Training in one pass") - if self._reversedorder: print ("will train generator first, then discriminator") - if self._heavycheck: print("running heavy check on weight sanity") - if self._show_values: print("showing the input values at each batch") - if self._show_loss: print("showing the loss at each batch") - if self._show_weights: print("showing weights statistics at each batch") + #if True: + # if self.with_fixed_disc: print ("the batch norm weights are fixed. heavey weight re-assigning") + # if self.checkpoint: print ("Checkpointing the model weigths after %d batch, based on the process id"%self.checkpoint) + # if self._onepass: print ("Training in one pass") + # if self._reversedorder: print ("will train generator first, then discriminator") + # if self._heavycheck: print("running heavy check on weight sanity") + # if self._show_values: print("showing the input values at each batch") + # if self._show_loss: print("showing the loss at each batch") + # if self._show_weights: print("showing weights statistics at each batch") MPIModel.__init__(self, models = [ self.discriminator, @@ -471,13 +453,10 @@ def big_assemble_models(self): def ext_assemble_models(self): - #print('[INFO] Building generator') self.generator = generator(self.latent_size, with_bn = self.gen_bn) - #print('[INFO] Building discriminator') self.discriminator = discriminator(discr_drop_out = self.discr_drop_out) if self.with_fixed_disc: self.fixed_discriminator = discriminator(discr_drop_out = self.discr_drop_out, fixed_bn=True) - #print('[INFO] Building combined') latent = Input(shape=(self.latent_size, ), name='combined_z') fake_image = self.generator(latent) if self.with_fixed_disc: @@ -493,7 +472,6 @@ def ext_assemble_models(self): def compile(self, **args): ## args are fully ignored here - #print('[INFO] IN GAN MODEL: COMPILE') if 'optimizer' in args and isinstance(args['optimizer'], OptimizerBuilder): opt_builder = args['optimizer'] else: @@ -511,7 +489,6 @@ def make_opt(**args): else: opt = SGD(lr=lr) - #print ("optimizer for compiling",opt) return opt self.generator.compile( @@ -535,14 +512,11 @@ def make_opt(**args): loss_weights=self.discr_loss_weights ) self.combined.metrics_names = self.discriminator.metrics_names - #print ("disc metrics",self.discriminator.metrics_names) - #print ("comb metrics",self.combined.metrics_names) if hasattr(self, 'calculate_fom'): self.energies, self.g4var = self.prepare_geant4_data() - #print ("compiled") def assemble_models(self): self.ext_assemble_models() @@ -553,44 +527,33 @@ def batch_transform(self, x, y): y_disc_real =y show_values = self._show_values def mm( label, t): - #print (label,np.min(t),np.max(t),np.mean(t),np.std(t),t.shape) pass if self.batch_size is None: ## fix me, maybe self.batch_size = x_disc_real.shape[0] - #print (hn(),"initializing sizes",x_disc_real.shape,[ yy.shape for yy in y]) noise = np.random.normal(0, 1, (self.batch_size, self.latent_size)) sampled_energies = np.random.uniform(0.1, 5,(self.batch_size,1)) generator_ip = np.multiply(sampled_energies, noise) - #if show_values: print ('energies',np.ravel(sampled_energies)[:10]) if show_values: mm('energies',sampled_energies) ratio = np.polyval(root_fit, sampled_energies) - #if show_values: print ('ratios',np.ravel(ratio)[:10]) if show_values: mm('ratios',ratio) ecal_ip = np.multiply(ratio, sampled_energies) - #if show_values: 
print ('estimated sum cells',np.ravel(ecal_ip)[:10]) if show_values: mm('estimated sum cells',ecal_ip) now = time.mktime(time.gmtime()) - #if self.p_cc>1 and len(self.p_t)%100==0: - # print ("prediction average",np.mean(self.p_t),"[s]' over",len(self.p_t)) generated_images = self.generator.predict(generator_ip) ecal_rip = np.squeeze(np.sum(generated_images, axis=(1, 2, 3))) - #if show_values: print ('generated sum cells',np.ravel(ecal_rip)[:10]) if show_values: mm('generated sum cells',ecal_rip) norm_overflow = False apply_identify = False ## False was intended originally if norm_overflow and np.max( ecal_rip ) > 1000.: - #if show_values: print ("normalizing back") - #ecal_ip = ecal_rip generated_images /= np.max( generated_images ) ecal_rip = np.squeeze(np.sum(generated_images, axis=(1, 2, 3))) - #if show_values: print ('generated sum cells',np.ravel(ecal_rip)[:10]) if show_values: mm('generated sum cells',ecal_rip) elif apply_identify: ecal_ip = ecal_rip @@ -624,7 +587,6 @@ def mm( label, t): c_noise = np.random.normal(0, 1, (2*self.batch_size, self.latent_size)) - ###print ('noise',np.ravel(noise)[:10]) c_sampled_energies = np.random.uniform(0.1, 5, (2*self.batch_size,1 )) c_generator_ip = np.multiply(c_sampled_energies, c_noise) c_ratio = np.polyval(root_fit, c_sampled_energies) @@ -650,9 +612,6 @@ def test_on_batch(self,x, y, sample_weight=None): (X_for_disc,Y_for_disc,X_for_combined,Y_for_combined) = self.batch_transform(x,y) epoch_disc_loss = self.discriminator.test_on_batch(X_for_disc,Y_for_disc) epoch_gen_loss = self.combined.test_on_batch(X_for_combined,Y_for_combined) - #if show_loss: - # print ("test discr loss",epoch_disc_loss) - # print ("test combined loss",epoch_gen_loss) else: ((x_disc_real,re_y),(generated_images, y_disc_fake),(x_comb1,y_comb1),(x_comb2,y_comb2)) = self.batch_transform(x,y) real_disc_loss = self.discriminator.test_on_batch( x_disc_real,re_y ) @@ -662,9 +621,6 @@ def test_on_batch(self,x, y, sample_weight=None): c_loss1= self.combined.test_on_batch( x_comb1,y_comb1 ) c_loss2= self.combined.test_on_batch(x_comb2,y_comb2 ) epoch_gen_loss = [(a + b) / 2 for a, b in zip(c_loss1,c_loss2)] - #if show_loss: - # print ("test discr loss",real_disc_loss,fake_disc_loss) - # print ("test combined loss",c_loss1, c_loss2) @@ -683,7 +639,7 @@ def train_on_batch(self, x, y, def _checkpoint(self): if self.checkpoint and (self.g_cc%self.checkpoint)==0: dest='%s/mpi_generator_%s_%s.h5'%(os.environ.get('GANCHECKPOINTLOC','.'),socket.gethostname(),os.getpid()) - print ("Saving generator to",dest,"at",self.g_cc) + logging.info("Saving generator to {} at {}".format(dest, self.g_cc)) self.generator.save_weights(dest) def _onepass_train_on_batch(self, x, y, @@ -707,8 +663,6 @@ def _train_disc(): self.discriminator.trainable = True now = time.mktime(time.gmtime()) epoch_disc_loss = self.discriminator.train_on_batch(X_for_disc,Y_for_disc) - #if show_loss: - # print (self.d_cc," discr loss",epoch_disc_loss) done = time.mktime(time.gmtime()) if self.d_cc: self.d_t.append( done - now ) @@ -724,13 +678,10 @@ def _train_comb(noT=False): self.discriminator.trainable = False now = time.mktime(time.gmtime()) if noT: - #print ("evaluating the combined model") epoch_gen_loss = self.combined.test_on_batch(X_for_combined,Y_for_combined) else: epoch_gen_loss = self.combined.train_on_batch(X_for_combined,Y_for_combined) - #if show_loss: - # print (self.g_cc,"combined loss",epoch_gen_loss) done = time.mktime(time.gmtime()) if self.g_cc: self.g_t.append( done - now ) @@ -765,12 +716,6 @@ def 
_train_comb(noT=False): weights( self.combined ) - #if len(self.g_t)>0 and len(self.g_t)%100==0: - # print ("generator average ",np.mean(self.g_t),"[s] over",len(self.g_t)) - - #if len(self.d_t)>0 and len(self.d_t)%100==0: - # print ("discriminator average",np.mean(self.d_t),"[s] over ",len(self.d_t)) - self._checkpoint() return np.asarray([epoch_disc_loss, epoch_gen_loss]) @@ -783,8 +728,6 @@ def _twopass_train_on_batch(self, x, y, show_loss = self._show_loss show_weights = self._show_weights - #if self.d_cc>1 and len(self.d_t)%100==0: - # print ("discriminator average",np.mean(self.d_t),"[s] over ",len(self.d_t)) self.discriminator.trainable = True if self._heavycheck: @@ -821,9 +764,6 @@ def _twopass_train_on_batch(self, x, y, weights_diff( on_weight , label='D-fake') - #if show_loss: - #print (self.discriminator.metrics_names) - #print (self.d_cc,"discr loss",real_batch_loss,fake_batch_loss) epoch_disc_loss = np.asarray([(a + b) / 2 for a, b in zip(real_batch_loss, fake_batch_loss)]) done = time.mktime(time.gmtime()) if self.d_cc: @@ -836,7 +776,6 @@ def _twopass_train_on_batch(self, x, y, weights( self.combined ) if self.g_cc>1 and len(self.g_t)%100==0: - #print ("generator average ",np.mean(self.g_t),"[s] over",len(self.g_t)) now = time.mktime(time.gmtime()) if self.g_cc: @@ -851,9 +790,6 @@ def _twopass_train_on_batch(self, x, y, if show_weights: weights( on_weight ) weights_diff( on_weight , label='C-2') - #if show_loss: - # #print(self.combined.metrics_names) - # print (self.g_cc,"combined loss",c_loss1,c_loss2) epoch_gen_loss = np.asarray([(a + b) / 2 for a, b in zip(c_loss1,c_loss2)]) done = time.mktime(time.gmtime()) if self.g_cc: @@ -870,18 +806,18 @@ def _twopass_train_on_batch(self, x, y, checks = [np.all(np.equal(a,b)) for (a,b) in zip(check_on_weight,and_check_on_weight)] weights_have_changed = not all(checks) weights_are_all_equal = all(checks) - print ('Weights are the same?',checks) + logging.info("Weights are the same? {}".format(str(checks))) if weights_have_changed: for iw,b in enumerate(checks): if not b: - print (iw,"This",check_on_weight[iw].shape) - print (np.ravel(check_on_weight[iw])[:10]) - print (iw,"And that",and_check_on_weight[iw].shape) - print (np.ravel(and_check_on_weight[iw])[:10]) + logging.info("{} This {}".format(iw,str(check_on_weight[iw].shape))) + logging.info("{}".format(np.ravel(check_on_weight[iw])[:10])) + logging.info("{} And that {}".format(iw,and_check_on_weight[iw].shape)) + logging.info("{}".format(np.ravel(and_check_on_weight[iw])[:10])) else: - print ("weights are all identical") - print (np.ravel(and_check_on_weight[1])[:10]) - print (np.ravel(check_on_weight[1])[:10]) + logging.info("weights are all identical") + logging.info("".format(str(np.ravel(and_check_on_weight[1])[:10]))) + logging.info("".format(str(np.ravel(check_on_weight[1])[:10]))) self._checkpoint() @@ -890,7 +826,7 @@ def _twopass_train_on_batch(self, x, y, switching_loss = (1.,1.) 
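# [editor's note] Two flags on the surrounding hunks. First, the calls of the form
# logging.info("".format(...)) above log an empty message and silently discard
# their payload; "{}".format(...) is presumably what was meant. Second, the
# `if False and not self.recompiled ...` line just below appears to have lost the
# text between a '<' and a '>' (including wherever `lr` and `nlr` are computed),
# so it is left as found. For reference, a minimal tf.keras sketch of lowering an
# optimizer learning rate at runtime, which is what this disabled block appears
# to do; the names `model` and `factor` are illustrative, not from this patch:
import tensorflow.keras.backend as K

def switch_lr(model, factor=10.0):
    lr = float(K.get_value(model.optimizer.lr))
    nlr = lr / factor
    if abs(lr - nlr) > 0.0001:   # skip vanishing updates, mirroring the code below
        K.set_value(model.optimizer.lr, nlr)
    return nlr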
if False and not self.recompiled and epoch_disc_loss[0] 0.0001: - print ("#"*30) - print ("swithcing lr",lr,"to", nlr) + logging.info("{}".format("#"*30)) + logging.info("swithcing lr {} to {}".format(lr, nlr)) K.set_value( self.discriminator.optimizer.lr, nlr) - print (K.get_value( self.discriminator.optimizer.lr )) + logging.info("{}".format(K.get_value( self.discriminator.optimizer.lr ))) K.set_value( self.combined.optimizer.lr, nlr) - print (K.get_value( self.combined.optimizer.lr )) - print ("#"*30) + logging.info("{}".format(K.get_value( self.combined.optimizer.lr ))) return np.asarray([epoch_disc_loss, epoch_gen_loss]) @@ -946,7 +881,6 @@ def prepare_geant4_data(self, **args): return energies, var def figure_of_merit(self, **args): - #print (self.histories) delta_loss = np.abs(self.histories['discriminator_model']['val_classification_loss'][-1] - self.histories['combined_model']['val_classification_loss'][-1]) return delta_loss diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..db26e3d --- /dev/null +++ b/setup.py @@ -0,0 +1,30 @@ +from setuptools import setup, find_packages + +with open('README.md') as readme_file: + README = readme_file.read() + +with open('HISTORY.md') as history_file: + HISTORY = history_file.read() + +setup_args = dict( + name='nnlo', + version='0.0.1', + description='Distributed Machine Learning tool for High Performance Computing', + long_description_content_type="text/markdown", + long_description=README + '\n\n' + HISTORY, + license='MIT', + packages=find_packages(), + author='NNLO team', + author_email='rui.zhang@cern.ch', + keywords=['Distributed Machine Learning', 'High Performance Computing', 'Hyperparameter optimisation'], + url='https://github.com/chnzhangrui/NNLO', + download_url='https://pypi.org/project/nnlo/' +) + +install_requires = [ + 'tensorflow>=2', + 'mpi4py>3' +] + +if __name__ == '__main__': + setup(**setup_args, install_requires=install_requires, include_package_data=True) From 3b727a9c0537dae0b6a0d36f1e193b6565b3791b Mon Sep 17 00:00:00 2001 From: Rui Zhang Date: Wed, 29 Jul 2020 05:04:45 -0400 Subject: [PATCH 03/18] Fix TF2 compatibility --- .gitignore | 1 + TrainingDriver.py | 51 ++++++++++--------- __init__.py | 5 -- examples/example_mnist_torch.py | 5 -- nnlo/__init__.py | 6 +++ {models => nnlo/examples}/__init__.py | 0 .../examples}/example_hls4mlgru.py | 0 .../examples}/example_jedi_torch.py | 0 {examples => nnlo/examples}/example_mnist.py | 2 +- nnlo/examples/example_mnist_torch.py | 5 ++ {models => nnlo/models}/BuildModel.py | 0 {models => nnlo/models}/Models.py | 0 {models => nnlo/models}/TorchModels.py | 0 nnlo/models/__init__.py | 0 {models => nnlo/models}/get_3d.py | 0 {models => nnlo/models}/get_cifar10.py | 0 {models => nnlo/models}/get_mnist.py | 0 {models => nnlo/models}/get_topclass.py | 0 nnlo/train/model.py | 2 +- setup.py | 4 +- 20 files changed, 43 insertions(+), 38 deletions(-) delete mode 100644 __init__.py delete mode 100644 examples/example_mnist_torch.py rename {models => nnlo/examples}/__init__.py (100%) rename {examples => nnlo/examples}/example_hls4mlgru.py (100%) rename {examples => nnlo/examples}/example_jedi_torch.py (100%) rename {examples => nnlo/examples}/example_mnist.py (95%) create mode 100644 nnlo/examples/example_mnist_torch.py rename {models => nnlo/models}/BuildModel.py (100%) rename {models => nnlo/models}/Models.py (100%) rename {models => nnlo/models}/TorchModels.py (100%) create mode 100644 nnlo/models/__init__.py rename {models => nnlo/models}/get_3d.py (100%) rename 
{models => nnlo/models}/get_cifar10.py (100%) rename {models => nnlo/models}/get_mnist.py (100%) rename {models => nnlo/models}/get_topclass.py (100%) diff --git a/.gitignore b/.gitignore index 4da7146..6d47be2 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ build/ dist/ nnlo.egg-info/ +publish.sh diff --git a/TrainingDriver.py b/TrainingDriver.py index 00a1e8a..8a6ecbf 100755 --- a/TrainingDriver.py +++ b/TrainingDriver.py @@ -234,7 +234,8 @@ def make_features_labels(m_module, args): if 'torch' in args.model: a_backend = 'torch' - m_module = __import__(args.model.replace('.py','').replace('/', '.'), fromlist=[None]) if '.py' in args.model else None + print('nnlo.'+args.model.replace('.py','').replace('/', '.')) + m_module = __import__('nnlo.'+args.model.replace('.py','').replace('/', '.'), fromlist=[None]) if '.py' in args.model else None (features_name, labels_name) = make_features_labels(m_module, args) (train_list, val_list) = make_train_val_lists(m_module, args) comm = MPI.COMM_WORLD.Dup() @@ -260,29 +261,33 @@ def make_features_labels(m_module, args): logging.debug("Using TensorFlow") os.environ['KERAS_BACKEND'] = 'tensorflow' + import tensorflow as tf import_keras() - tf.config.gpu.set_per_process_memory_fraction(0.1) - gpu_options=K.tf.GPUOptions( - per_process_gpu_memory_fraction=0.1, #was 0.0 - allow_growth = True, - visible_device_list = device[-1] if 'gpu' in device else '') - gpu_options=K.tf.GPUOptions( - per_process_gpu_memory_fraction=0.0, - allow_growth = True,) - NTHREADS=(2,1) - NTHREADS=None - if NTHREADS is None: - K.set_session( K.tf.Session( config=K.tf.ConfigProto( - allow_soft_placement=True, log_device_placement=False, - gpu_options=gpu_options - ) ) ) - else: - K.set_session( K.tf.Session( config=K.tf.ConfigProto( - allow_soft_placement=True, log_device_placement=False, - gpu_options=gpu_options, - intra_op_parallelism_threads=NTHREADS[0], - inter_op_parallelism_threads=NTHREADS[1], - ) ) ) + #gpu_options=K.tf.GPUOptions( + # per_process_gpu_memory_fraction=0.1, #was 0.0 + # allow_growth = True, + # visible_device_list = device[-1] if 'gpu' in device else '') + #gpu_options=K.tf.GPUOptions( + # per_process_gpu_memory_fraction=0.0, + # allow_growth = True,) + gpu_devices = tf.config.experimental.list_physical_devices('GPU') + for device in gpu_devices: + tf.config.experimental.set_memory_growth(device, True) + + #NTHREADS=(2,1) + #NTHREADS=None + #if NTHREADS is None: + # K.set_session( K.tf.Session( config=K.tf.ConfigProto( + # allow_soft_placement=True, log_device_placement=False, + # gpu_options=gpu_options + # ) ) ) + #else: + # K.set_session( K.tf.Session( config=K.tf.ConfigProto( + # allow_soft_placement=True, log_device_placement=False, + # gpu_options=gpu_options, + # intra_op_parallelism_threads=NTHREADS[0], + # inter_op_parallelism_threads=NTHREADS[1], + # ) ) ) model_builder = ModelTensorFlow( comm, source=args.model, weights=model_weights) diff --git a/__init__.py b/__init__.py deleted file mode 100644 index 26f599e..0000000 --- a/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from nnlo.optimize import * -from nnlo.mpi import * -from nnlo.train import * -from nnlo.util import * -from models import * diff --git a/examples/example_mnist_torch.py b/examples/example_mnist_torch.py deleted file mode 100644 index af916c9..0000000 --- a/examples/example_mnist_torch.py +++ /dev/null @@ -1,5 +0,0 @@ -from models.Models import make_mnist_torch_model -from examples.example_mnist import * - -get_model = make_mnist_torch_model - diff --git a/nnlo/__init__.py 
b/nnlo/__init__.py index e69de29..05b3a3d 100644 --- a/nnlo/__init__.py +++ b/nnlo/__init__.py @@ -0,0 +1,6 @@ +from nnlo.optimize import * +from nnlo.mpi import * +from nnlo.train import * +from nnlo.util import * +from nnlo.models import * +from nnlo.examples import * diff --git a/models/__init__.py b/nnlo/examples/__init__.py similarity index 100% rename from models/__init__.py rename to nnlo/examples/__init__.py diff --git a/examples/example_hls4mlgru.py b/nnlo/examples/example_hls4mlgru.py similarity index 100% rename from examples/example_hls4mlgru.py rename to nnlo/examples/example_hls4mlgru.py diff --git a/examples/example_jedi_torch.py b/nnlo/examples/example_jedi_torch.py similarity index 100% rename from examples/example_jedi_torch.py rename to nnlo/examples/example_jedi_torch.py diff --git a/examples/example_mnist.py b/nnlo/examples/example_mnist.py similarity index 95% rename from examples/example_mnist.py rename to nnlo/examples/example_mnist.py index a6b11bd..80a9519 100644 --- a/examples/example_mnist.py +++ b/nnlo/examples/example_mnist.py @@ -1,4 +1,4 @@ -from models.Models import make_mnist_model +from nnlo.models.Models import make_mnist_model get_model = make_mnist_model def get_name(): diff --git a/nnlo/examples/example_mnist_torch.py b/nnlo/examples/example_mnist_torch.py new file mode 100644 index 0000000..8e7fddf --- /dev/null +++ b/nnlo/examples/example_mnist_torch.py @@ -0,0 +1,5 @@ +from nnlo.models.Models import make_mnist_torch_model +from nnlo.examples.example_mnist import * + +get_model = make_mnist_torch_model + diff --git a/models/BuildModel.py b/nnlo/models/BuildModel.py similarity index 100% rename from models/BuildModel.py rename to nnlo/models/BuildModel.py diff --git a/models/Models.py b/nnlo/models/Models.py similarity index 100% rename from models/Models.py rename to nnlo/models/Models.py diff --git a/models/TorchModels.py b/nnlo/models/TorchModels.py similarity index 100% rename from models/TorchModels.py rename to nnlo/models/TorchModels.py diff --git a/nnlo/models/__init__.py b/nnlo/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/models/get_3d.py b/nnlo/models/get_3d.py similarity index 100% rename from models/get_3d.py rename to nnlo/models/get_3d.py diff --git a/models/get_cifar10.py b/nnlo/models/get_cifar10.py similarity index 100% rename from models/get_cifar10.py rename to nnlo/models/get_cifar10.py diff --git a/models/get_mnist.py b/nnlo/models/get_mnist.py similarity index 100% rename from models/get_mnist.py rename to nnlo/models/get_mnist.py diff --git a/models/get_topclass.py b/nnlo/models/get_topclass.py similarity index 100% rename from models/get_topclass.py rename to nnlo/models/get_topclass.py diff --git a/nnlo/train/model.py b/nnlo/train/model.py index 5f76b22..b24c6b3 100644 --- a/nnlo/train/model.py +++ b/nnlo/train/model.py @@ -434,7 +434,7 @@ def __init__(self, comm, source, custom_objects={}, weights=None): if isinstance(source, six.string_types): if source.endswith('.py'): - module = __import__(source.replace('.py','').replace('/', '.'), fromlist=[None]) + module = __import__('nnlo.'+source.replace('.py','').replace('/', '.'), fromlist=[None]) self.model = module.get_model() self.filename = None else: diff --git a/setup.py b/setup.py index db26e3d..32de486 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ setup_args = dict( name='nnlo', - version='0.0.1', + version='0.0.3', description='Distributed Machine Learning tool for High Performance Computing', 
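#     [editor's note] Alongside the version bump here, the next hunk of this diff
#     empties install_requires (tensorflow and mpi4py are dropped), so
#     `pip install nnlo` stops pulling them in. That suits HPC sites where MPI and
#     TensorFlow come from system modules, but users must now provide both
#     themselves. A hedged alternative that keeps the dependency metadata as
#     optional extras (sketch only, not part of the patch):
#
#         extras_require={'tf': ['tensorflow>=2'], 'mpi': ['mpi4py>3']},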
long_description_content_type="text/markdown", long_description=README + '\n\n' + HISTORY, @@ -22,8 +22,6 @@ ) install_requires = [ - 'tensorflow>=2', - 'mpi4py>3' ] if __name__ == '__main__': From d0a673cbfb2703790f870397ccc46023361a4888 Mon Sep 17 00:00:00 2001 From: Rui Zhang Date: Thu, 30 Jul 2020 17:13:47 -0400 Subject: [PATCH 04/18] Move drivers to nnlo.driver --- nnlo/__init__.py | 1 + MPIGDriver.py => nnlo/driver/MPIGDriver.py | 5 +++- .../driver/OptimizationDriver.py | 7 +++-- .../driver/TrainingDriver.py | 28 +++++++++++++------ nnlo/driver/__init__.py | 0 setup.py | 3 ++ 6 files changed, 32 insertions(+), 12 deletions(-) rename MPIGDriver.py => nnlo/driver/MPIGDriver.py (99%) rename OptimizationDriver.py => nnlo/driver/OptimizationDriver.py (99%) rename TrainingDriver.py => nnlo/driver/TrainingDriver.py (93%) create mode 100644 nnlo/driver/__init__.py diff --git a/nnlo/__init__.py b/nnlo/__init__.py index 05b3a3d..458a243 100644 --- a/nnlo/__init__.py +++ b/nnlo/__init__.py @@ -4,3 +4,4 @@ from nnlo.util import * from nnlo.models import * from nnlo.examples import * +from nnlo.driver import * diff --git a/MPIGDriver.py b/nnlo/driver/MPIGDriver.py similarity index 99% rename from MPIGDriver.py rename to nnlo/driver/MPIGDriver.py index 52d5e14..cd39854 100755 --- a/MPIGDriver.py +++ b/nnlo/driver/MPIGDriver.py @@ -21,7 +21,7 @@ import socket -if __name__ == '__main__': +def main(): from TrainingDriver import add_loader_options parser = argparse.ArgumentParser() parser.add_argument('--verbose',help='display metrics for each training batch',action='store_true') @@ -162,3 +162,6 @@ comm.Barrier() logging.info("Terminating") + +if __name__ == '__main__': + main() diff --git a/OptimizationDriver.py b/nnlo/driver/OptimizationDriver.py similarity index 99% rename from OptimizationDriver.py rename to nnlo/driver/OptimizationDriver.py index 8b6359b..a140971 100755 --- a/OptimizationDriver.py +++ b/nnlo/driver/OptimizationDriver.py @@ -105,9 +105,7 @@ def make_opt_parser(): return parser - -if __name__ == '__main__': - +def main(): logging.info("Process is on {}".format(socket.gethostname())) parser = make_opt_parser() args = parser.parse_args() @@ -323,3 +321,6 @@ def make_opt_parser(): checkpoint=args.checkpoint, checkpoint_interval=args.checkpoint_interval) block.run() + +if __name__ == '__main__': + main() diff --git a/TrainingDriver.py b/nnlo/driver/TrainingDriver.py similarity index 93% rename from TrainingDriver.py rename to nnlo/driver/TrainingDriver.py index 8a6ecbf..9d9c0c4 100755 --- a/TrainingDriver.py +++ b/nnlo/driver/TrainingDriver.py @@ -12,6 +12,7 @@ from mpi4py import MPI from time import time,sleep +import importlib from nnlo.mpi.manager import MPIManager, get_device from nnlo.train.algo import Algo @@ -27,6 +28,7 @@ def add_log_option(parser): # logging configuration parser.add_argument('--log-file', default=None, dest='log_file', help='log file to write, in additon to output stream') parser.add_argument('--log-level', default='info', dest='log_level', help='log level (debug, info, warn, error)') + parser.add_argument('--output', default='./', dest='output', help='output folder') def add_master_option(parser): parser.add_argument('--master-gpu',help='master process should get a gpu', @@ -89,7 +91,7 @@ def add_train_options(parser): parser.add_argument('--thread_validation', help='run a single process', action='store_true') # model arguments - parser.add_argument('--model', help='File containing model architecture (serialized in JSON/pickle, or provided in a .py file') + 
parser.add_argument('--model', choices=['mnist'], help='File containing model architecture (serialized in JSON/pickle, or provided in a .py file') parser.add_argument('--trial-name', help='descriptive name for trial', default='train', dest='trial_name') @@ -225,7 +227,7 @@ def make_features_labels(m_module, args): labels_name = m_module.get_labels() if m_module is not None and hasattr(m_module,"get_labels") else args.labels_name return (features_name, labels_name) -if __name__ == '__main__': +def main(): parser = make_train_parser() args = parser.parse_args() initialize_logger(filename=args.log_file, file_level=args.log_level, stream_level=args.log_level) @@ -234,8 +236,14 @@ def make_features_labels(m_module, args): if 'torch' in args.model: a_backend = 'torch' - print('nnlo.'+args.model.replace('.py','').replace('/', '.')) - m_module = __import__('nnlo.'+args.model.replace('.py','').replace('/', '.'), fromlist=[None]) if '.py' in args.model else None + m_module, model_source = None, None + if args.model == 'mnist': + try: + m_module = importlib.import_module(f'nnlo.examples.example_mnist') + model_source = 'examples/example_mnist.py' + except Exception as e: + logging.fatal(e) + (features_name, labels_name) = make_features_labels(m_module, args) (train_list, val_list) = make_train_val_lists(m_module, args) comm = MPI.COMM_WORLD.Dup() @@ -256,13 +264,14 @@ def make_features_labels(m_module, args): if use_torch: logging.debug("Using pytorch") - model_builder = ModelPytorch(comm, source=args.model, weights=model_weights, gpus=1 if 'gpu' in device else 0) + model_builder = ModelPytorch(comm, source=model_source, weights=model_weights, gpus=1 if 'gpu' in device else 0) else: logging.debug("Using TensorFlow") os.environ['KERAS_BACKEND'] = 'tensorflow' import tensorflow as tf import_keras() + #tf.config.gpu.set_per_process_memory_fraction(0.1) #gpu_options=K.tf.GPUOptions( # per_process_gpu_memory_fraction=0.1, #was 0.0 # allow_growth = True, @@ -290,7 +299,7 @@ def make_features_labels(m_module, args): # ) ) ) - model_builder = ModelTensorFlow( comm, source=args.model, weights=model_weights) + model_builder = ModelTensorFlow( comm, source=model_source, weights=model_weights) data = make_loader(args, features_name, labels_name, train_list) @@ -318,8 +327,8 @@ def make_features_labels(m_module, args): else: model_name = os.path.basename(args.model).replace('.json','') - json_name = '_'.join([model_name,args.trial_name,"history.json"]) - tl_json_name = '_'.join([model_name,args.trial_name,"timeline.json"]) + json_name = args.output + '/' + '_'.join([model_name,args.trial_name,"history.json"]) + tl_json_name = args.output + '/' + '_'.join([model_name,args.trial_name,"timeline.json"]) # Process 0 launches the training procedure if comm.Get_rank() == 0: @@ -338,3 +347,6 @@ def make_features_labels(m_module, args): comm.barrier() logging.info("Terminating") if args.timeline: Timeline.collect(clean=True, file_name=tl_json_name) + +if __name__ == '__main__': + main() diff --git a/nnlo/driver/__init__.py b/nnlo/driver/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/setup.py b/setup.py index 32de486..678311d 100644 --- a/setup.py +++ b/setup.py @@ -9,6 +9,9 @@ setup_args = dict( name='nnlo', version='0.0.3', + entry_points = { + 'console_scripts': ['TrainingDriver=nnlo.driver.TrainingDriver:main'], + }, description='Distributed Machine Learning tool for High Performance Computing', long_description_content_type="text/markdown", long_description=README + '\n\n' + HISTORY, From 
64915aebf7e8e248df3503858a41f51da039c798 Mon Sep 17 00:00:00 2001 From: Rui Zhang Date: Fri, 31 Jul 2020 10:14:54 -0400 Subject: [PATCH 05/18] Command line tools for data download - GetData [mnist | cifar10] --- .gitignore | 1 + nnlo/models/get_3d.py | 107 ++++++++++++++++++------------------ nnlo/models/get_cifar10.py | 94 ++++++++++++++++--------------- nnlo/models/get_mnist.py | 94 ++++++++++++++++--------------- nnlo/models/get_topclass.py | 44 ++++++++------- nnlo/models/getdata.py | 20 +++++++ setup.py | 3 +- 7 files changed, 200 insertions(+), 163 deletions(-) create mode 100644 nnlo/models/getdata.py diff --git a/.gitignore b/.gitignore index 6d47be2..a52bb86 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ build/ dist/ nnlo.egg-info/ publish.sh +.local/ diff --git a/nnlo/models/get_3d.py b/nnlo/models/get_3d.py index dd72838..bd8b4fa 100644 --- a/nnlo/models/get_3d.py +++ b/nnlo/models/get_3d.py @@ -26,64 +26,67 @@ def get_data(datafile): f.close() return X, y, ecal -dest='/data/shared/3DGAN/' -import socket -host = os.environ.get('HOST', os.environ.get('HOSTNAME',socket.gethostname())) -if 'daint' in host: - dest='/scratch/snx3000/vlimant/3DGAN/' -if 'titan' in host: - dest='/ccs/proj/csc291/DATA/3DGAN/' - -sub_split = int(sys.argv[1]) if len(sys.argv)>1 else 1 - -for F in glob.glob('/bigdata/shared/LCD/NewV1/*scan/*.h5'): - _,d,f = F.rsplit('/',2) - if not 'Ele' in d: continue - X = None - if sub_split==1: - nf = '%s/%s_%s.h5'%( dest,d,f) - if os.path.isfile( nf) : - continue - logging.info("processing files {} into {}".format(F,nf)) - if X is None: - X,y,ecal = get_data(F) - o = h5py.File(nf,'w') - o['X'] = X - o.create_group("y") - o['y']['a'] = np.ones(y.shape) - o['y']['b'] = y - o['y']['c'] = ecal - o.close() - else: - for sub in range(sub_split): - nf = '%s/%s_%s_sub%s.h5'%(dest, d,f,sub) +def main(): + dest='/data/shared/3DGAN/' + import socket + host = os.environ.get('HOST', os.environ.get('HOSTNAME',socket.gethostname())) + if 'daint' in host: + dest='/scratch/snx3000/vlimant/3DGAN/' + if 'titan' in host: + dest='/ccs/proj/csc291/DATA/3DGAN/' + + sub_split = int(sys.argv[1]) if len(sys.argv)>1 else 1 + + for F in glob.glob('/bigdata/shared/LCD/NewV1/*scan/*.h5'): + _,d,f = F.rsplit('/',2) + if not 'Ele' in d: continue + X = None + if sub_split==1: + nf = '%s/%s_%s.h5'%( dest,d,f) if os.path.isfile( nf) : continue logging.info("processing files {} into {}".format(F,nf)) if X is None: X,y,ecal = get_data(F) - N = X.shape[0] - splits = [i*N/sub_split for i in range(sub_split)]+[-1] o = h5py.File(nf,'w') - o['X'] = X[splits[sub]:splits[sub+1],...] + o['X'] = X o.create_group("y") - o['y']['a'] = np.ones(y[splits[sub]:splits[sub+1],...].shape) - o['y']['b'] = y[splits[sub]:splits[sub+1],...] - o['y']['c'] = ecal[splits[sub]:splits[sub+1],...] 
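# [editor's note] Two issues in the split logic above. Under Python 3,
# `[i*N/sub_split for i in range(sub_split)]` produces floats, and float slice
# bounds raise TypeError when indexing the numpy arrays; floor division is the
# hedged fix:
#
#     splits = [i * N // sub_split for i in range(sub_split)] + [-1]
#
# Also, the trailing [-1] sentinel makes the last slice stop one event short of
# the end of the array; appending [N] instead would keep the final event.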
- o.close() - X = None - -if sub_split == 1: - sub_files = lambda f:not 'sub' in f -else: - sub_files = lambda f:'sub' in f - -open('train_3d.list','w').write( '\n'.join(filter(sub_files,glob.glob(dest+'/*.h5')[:-4]))) -open('test_3d.list','w').write( '\n'.join(filter(sub_files,glob.glob(dest+'/*.h5')[-4:]))) + o['y']['a'] = np.ones(y.shape) + o['y']['b'] = y + o['y']['c'] = ecal + o.close() + else: + for sub in range(sub_split): + nf = '%s/%s_%s_sub%s.h5'%(dest, d,f,sub) + if os.path.isfile( nf) : + continue + logging.info("processing files {} into {}".format(F,nf)) + if X is None: + X,y,ecal = get_data(F) + N = X.shape[0] + splits = [i*N/sub_split for i in range(sub_split)]+[-1] + o = h5py.File(nf,'w') + o['X'] = X[splits[sub]:splits[sub+1],...] + o.create_group("y") + o['y']['a'] = np.ones(y[splits[sub]:splits[sub+1],...].shape) + o['y']['b'] = y[splits[sub]:splits[sub+1],...] + o['y']['c'] = ecal[splits[sub]:splits[sub+1],...] + o.close() + X = None -open('train_small_3d.list','w').write( '\n'.join(filter(sub_files,glob.glob(dest+'/*.h5')[:-4]))) -open('test_small_3d.list','w').write( '\n'.join(filter(sub_files,glob.glob(dest+'/*.h5')[-4:]))) - -open('train_7_3d.list','w').write( '\n'.join(filter(sub_files,glob.glob(dest+'/*.h5')[:7]))) -open('test_1_3d.list','w').write( '\n'.join(filter(sub_files,glob.glob(dest+'/*.h5')[-1:]))) + if sub_split == 1: + sub_files = lambda f:not 'sub' in f + else: + sub_files = lambda f:'sub' in f + + open('train_3d.list','w').write( '\n'.join(filter(sub_files,glob.glob(dest+'/*.h5')[:-4]))) + open('test_3d.list','w').write( '\n'.join(filter(sub_files,glob.glob(dest+'/*.h5')[-4:]))) + + open('train_small_3d.list','w').write( '\n'.join(filter(sub_files,glob.glob(dest+'/*.h5')[:-4]))) + open('test_small_3d.list','w').write( '\n'.join(filter(sub_files,glob.glob(dest+'/*.h5')[-4:]))) + + open('train_7_3d.list','w').write( '\n'.join(filter(sub_files,glob.glob(dest+'/*.h5')[:7]))) + open('test_1_3d.list','w').write( '\n'.join(filter(sub_files,glob.glob(dest+'/*.h5')[-1:]))) +if __name__ == '__main__': + main() diff --git a/nnlo/models/get_cifar10.py b/nnlo/models/get_cifar10.py index bebcb8a..13b5a8d 100644 --- a/nnlo/models/get_cifar10.py +++ b/nnlo/models/get_cifar10.py @@ -2,52 +2,56 @@ # each piece in a separate h5 file. 
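# [editor's note] The replacement imports below reach into tensorflow.python.*,
# a private namespace that moves between TensorFlow releases. The public TF2
# spellings of the two helpers this script uses are (a sketch, not part of the
# patch):
from tensorflow.keras.utils import to_categorical   # replaces np_utils.to_categorical
from tensorflow.keras import backend as K           # K.image_data_format() is public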
from numpy import array_split -from keras.datasets import cifar10 -from keras.utils import np_utils -from keras import backend as K +from tensorflow.keras.datasets import cifar10 +from tensorflow.python.keras.utils import np_utils +from tensorflow.python.keras import backend as K import h5py import sys -(X_train, Y_train), (X_test, Y_test) = cifar10.load_data() +def main(argv): + (X_train, Y_train), (X_test, Y_test) = cifar10.load_data() + + img_rows = 32 + img_cols = 32 + if K.image_data_format() == 'channels_first': + X_train = X_train.reshape(X_train.shape[0], 3, img_rows, img_cols) + X_test = X_test.reshape(X_test.shape[0], 3, img_rows, img_cols) + input_shape = (3, img_rows, img_cols) + else: + X_train = X_train.reshape(X_train.shape[0], img_rows, img_cols, 3) + X_test = X_test.reshape(X_test.shape[0], img_rows, img_cols, 3) + input_shape = (img_rows, img_cols, 3) + + num_train_pieces = int(argv[1]) if len(argv)>1 else 24 + num_test_pieces = int(argv[2]) if len(argv)>1 else 4 + split_X_train = [ X.astype('float32') / 255 for X in array_split(X_train, num_train_pieces) ] + split_Y_train = [ np_utils.to_categorical(Y,10) for Y in array_split(Y_train, num_train_pieces) ] + split_X_test = [ X.astype('float32') / 255 for X in array_split(X_test, num_test_pieces) ] + split_Y_test = [ np_utils.to_categorical(Y,10) for Y in array_split(Y_test, num_test_pieces) ] + + train_list = [] + for i in range(num_train_pieces): + train_name = "cifar10_train_%d.h5" % i + train_list.append(train_name+"\n") + train_outfile = h5py.File( train_name, 'w' ) + train_outfile.create_dataset( "features", data=split_X_train[i] ) + train_outfile.create_dataset( "labels", data=split_Y_train[i] ) + train_outfile.close() + with open('train_cifar10.list', 'w') as train_list_file: + for f in train_list: + train_list_file.write(f) + + test_list = [] + for i in range(num_test_pieces): + test_name = "cifar10_test_%d.h5" % i + test_list.append(test_name+"\n") + test_outfile = h5py.File( test_name, 'w' ) + test_outfile.create_dataset( "features", data=split_X_test[i] ) + test_outfile.create_dataset( "labels", data=split_Y_test[i] ) + test_outfile.close() + with open('test_cifar10.list', 'w') as test_list_file: + for f in test_list: + test_list_file.write(f) -img_rows = 32 -img_cols = 32 -if K.image_dim_ordering() == 'th': - X_train = X_train.reshape(X_train.shape[0], 3, img_rows, img_cols) - X_test = X_test.reshape(X_test.shape[0], 3, img_rows, img_cols) - input_shape = (3, img_rows, img_cols) -else: - X_train = X_train.reshape(X_train.shape[0], img_rows, img_cols, 3) - X_test = X_test.reshape(X_test.shape[0], img_rows, img_cols, 3) - input_shape = (img_rows, img_cols, 3) - -num_train_pieces = int(sys.argv[1]) if len(sys.argv)>1 else 24 -num_test_pieces = int(sys.argv[2]) if len(sys.argv)>1 else 4 -split_X_train = [ X.astype('float32') / 255 for X in array_split(X_train, num_train_pieces) ] -split_Y_train = [ np_utils.to_categorical(Y,10) for Y in array_split(Y_train, num_train_pieces) ] -split_X_test = [ X.astype('float32') / 255 for X in array_split(X_test, num_test_pieces) ] -split_Y_test = [ np_utils.to_categorical(Y,10) for Y in array_split(Y_test, num_test_pieces) ] - -train_list = [] -for i in range(num_train_pieces): - train_name = "cifar10_train_%d.h5" % i - train_list.append(train_name+"\n") - train_outfile = h5py.File( train_name, 'w' ) - train_outfile.create_dataset( "features", data=split_X_train[i] ) - train_outfile.create_dataset( "labels", data=split_Y_train[i] ) - train_outfile.close() -with 
open('train_cifar10.list', 'w') as train_list_file: - for f in train_list: - train_list_file.write(f) - -test_list = [] -for i in range(num_test_pieces): - test_name = "cifar10_test_%d.h5" % i - test_list.append(test_name+"\n") - test_outfile = h5py.File( test_name, 'w' ) - test_outfile.create_dataset( "features", data=split_X_test[i] ) - test_outfile.create_dataset( "labels", data=split_Y_test[i] ) - test_outfile.close() -with open('test_cifar10.list', 'w') as test_list_file: - for f in test_list: - test_list_file.write(f) +if __name__ == '__main__': + main() diff --git a/nnlo/models/get_mnist.py b/nnlo/models/get_mnist.py index b5b2a14..86cab29 100644 --- a/nnlo/models/get_mnist.py +++ b/nnlo/models/get_mnist.py @@ -2,52 +2,56 @@ # each piece in a separate h5 file. from numpy import array_split -from keras.datasets import mnist -from keras.utils import np_utils -from keras import backend as K +from tensorflow.keras.datasets import mnist +from tensorflow.python.keras.utils import np_utils +from tensorflow.python.keras import backend as K import h5py import sys -(X_train, Y_train), (X_test, Y_test) = mnist.load_data() +def main(argv): + (X_train, Y_train), (X_test, Y_test) = mnist.load_data() + + img_rows = 28 + img_cols = 28 + if K.image_data_format() == 'channels_first': + X_train = X_train.reshape(X_train.shape[0], 1, img_rows, img_cols) + X_test = X_test.reshape(X_test.shape[0], 1, img_rows, img_cols) + input_shape = (1, img_rows, img_cols) + else: + X_train = X_train.reshape(X_train.shape[0], img_rows, img_cols, 1) + X_test = X_test.reshape(X_test.shape[0], img_rows, img_cols, 1) + input_shape = (img_rows, img_cols, 1) + + num_train_pieces = int(argv[1]) if len(argv)>1 else 24 + num_test_pieces = int(argv[2]) if len(argv)>1 else 4 + split_X_train = [ X.astype('float32') / 255 for X in array_split(X_train, num_train_pieces) ] + split_Y_train = [ np_utils.to_categorical(Y,10) for Y in array_split(Y_train, num_train_pieces) ] + split_X_test = [ X.astype('float32') / 255 for X in array_split(X_test, num_test_pieces) ] + split_Y_test = [ np_utils.to_categorical(Y,10) for Y in array_split(Y_test, num_test_pieces) ] + + train_list = [] + for i in range(num_train_pieces): + train_name = "mnist_train_%d.h5" % i + train_list.append(train_name+"\n") + train_outfile = h5py.File( train_name, 'w' ) + train_outfile.create_dataset( "features", data=split_X_train[i] ) + train_outfile.create_dataset( "labels", data=split_Y_train[i] ) + train_outfile.close() + with open('train_mnist.list', 'w') as train_list_file: + for f in train_list: + train_list_file.write(f) + + test_list = [] + for i in range(num_test_pieces): + test_name = "mnist_test_%d.h5" % i + test_list.append(test_name+"\n") + test_outfile = h5py.File( test_name, 'w' ) + test_outfile.create_dataset( "features", data=split_X_test[i] ) + test_outfile.create_dataset( "labels", data=split_Y_test[i] ) + test_outfile.close() + with open('test_mnist.list', 'w') as test_list_file: + for f in test_list: + test_list_file.write(f) -img_rows = 28 -img_cols = 28 -if K.image_dim_ordering() == 'th': - X_train = X_train.reshape(X_train.shape[0], 1, img_rows, img_cols) - X_test = X_test.reshape(X_test.shape[0], 1, img_rows, img_cols) - input_shape = (1, img_rows, img_cols) -else: - X_train = X_train.reshape(X_train.shape[0], img_rows, img_cols, 1) - X_test = X_test.reshape(X_test.shape[0], img_rows, img_cols, 1) -input_shape = (img_rows, img_cols, 1) - -num_train_pieces = int(sys.argv[1]) if len(sys.argv)>1 else 24 -num_test_pieces = int(sys.argv[2]) if 
len(sys.argv)>1 else 4 -split_X_train = [ X.astype('float32') / 255 for X in array_split(X_train, num_train_pieces) ] -split_Y_train = [ np_utils.to_categorical(Y,10) for Y in array_split(Y_train, num_train_pieces) ] -split_X_test = [ X.astype('float32') / 255 for X in array_split(X_test, num_test_pieces) ] -split_Y_test = [ np_utils.to_categorical(Y,10) for Y in array_split(Y_test, num_test_pieces) ] - -train_list = [] -for i in range(num_train_pieces): - train_name = "mnist_train_%d.h5" % i - train_list.append(train_name+"\n") - train_outfile = h5py.File( train_name, 'w' ) - train_outfile.create_dataset( "features", data=split_X_train[i] ) - train_outfile.create_dataset( "labels", data=split_Y_train[i] ) - train_outfile.close() -with open('train_mnist.list', 'w') as train_list_file: - for f in train_list: - train_list_file.write(f) - -test_list = [] -for i in range(num_test_pieces): - test_name = "mnist_test_%d.h5" % i - test_list.append(test_name+"\n") - test_outfile = h5py.File( test_name, 'w' ) - test_outfile.create_dataset( "features", data=split_X_test[i] ) - test_outfile.create_dataset( "labels", data=split_Y_test[i] ) - test_outfile.close() -with open('test_mnist.list', 'w') as test_list_file: - for f in test_list: - test_list_file.write(f) +if __name__ == '__main__': + main() diff --git a/nnlo/models/get_topclass.py b/nnlo/models/get_topclass.py index f3e1998..b30c167 100644 --- a/nnlo/models/get_topclass.py +++ b/nnlo/models/get_topclass.py @@ -2,24 +2,28 @@ import glob import sys -dest='/bigdata/shared/LCDJets_Abstract_IsoLep_lt_20' -import socket -host = os.environ.get('HOST', os.environ.get('HOSTNAME',socket.gethostname())) -if 'titan' in host: - dest='/ccs/proj/csc291/DATA/LCDJets_Abstract_IsoLep_lt_20' -train = glob.glob(dest+'/train/*.h5') -test = glob.glob(dest+'/val/*.h5') +def main(): + dest='/bigdata/shared/LCDJets_Abstract_IsoLep_lt_20' + import socket + host = os.environ.get('HOST', os.environ.get('HOSTNAME',socket.gethostname())) + if 'titan' in host: + dest='/ccs/proj/csc291/DATA/LCDJets_Abstract_IsoLep_lt_20' + train = glob.glob(dest+'/train/*.h5') + test = glob.glob(dest+'/val/*.h5') + + N=10 + Nt=N/5 + if len(sys.argv)>=1: + a = sys.argv[1] + if a.isdigit(): + N = int(a) + Nt=N/5 + else: + N,Nt = map(int, a.split(',')) + + + open('train_topclass.list','w').write( '\n'.join(sorted( train[:N] ))) + open('test_topclass.list','w').write( '\n'.join(sorted( test[:Nt] ))) -N=10 -Nt=N/5 -if len(sys.argv)>=1: - a = sys.argv[1] - if a.isdigit(): - N = int(a) - Nt=N/5 - else: - N,Nt = map(int, a.split(',')) - - -open('train_topclass.list','w').write( '\n'.join(sorted( train[:N] ))) -open('test_topclass.list','w').write( '\n'.join(sorted( test[:Nt] ))) +if __name__ == '__main__': + main() diff --git a/nnlo/models/getdata.py b/nnlo/models/getdata.py new file mode 100644 index 0000000..113cbb5 --- /dev/null +++ b/nnlo/models/getdata.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python +# Rui Zhang 7.2020 +# rui.zhang@cern.ch + +import sys + +def main(): + command = sys.argv[1] + argv = sys.argv[2:] + if command.lower() == 'mnist': + from nnlo.models.get_mnist import main + main(argv) + elif command.lower() == 'cifar10': + from nnlo.models.get_cifar10 import main + main(argv) + else: + raise RuntimeError('Unknown command: {}'.format(command)) + +if __name__ == '__main__': + main() diff --git a/setup.py b/setup.py index 678311d..faba807 100644 --- a/setup.py +++ b/setup.py @@ -8,9 +8,10 @@ setup_args = dict( name='nnlo', - version='0.0.3', + version='0.0.4', entry_points = { 
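#         [editor's note] Just below, 'console_scripts' ends up twice in the same
#         dict literal, so the second entry silently replaces the first and the
#         TrainingDriver command is lost on install. A hedged fix merges both
#         commands under one key:
#
#             entry_points={
#                 'console_scripts': [
#                     'TrainingDriver=nnlo.driver.TrainingDriver:main',
#                     'GetData=nnlo.models.getdata:main',
#                 ],
#             },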
diff --git a/setup.py b/setup.py
index 678311d..faba807 100644
--- a/setup.py
+++ b/setup.py
@@ -8,9 +8,10 @@
 
 setup_args = dict(
     name='nnlo',
-    version='0.0.3',
+    version='0.0.4',
    entry_points = {
        'console_scripts': ['TrainingDriver=nnlo.driver.TrainingDriver:main'],
+        'console_scripts': ['GetData=nnlo.models.getdata:main'],
    },
    description='Distributed Machine Learning tool for High Performance Computing',
    long_description_content_type="text/markdown",

From 210dfd363ccaf9a075e702c04c8606c4ba941bfe Mon Sep 17 00:00:00 2001
From: Rui Zhang
Date: Mon, 3 Aug 2020 09:28:10 -0400
Subject: [PATCH 06/18] Benchmark cifar10

---
 nnlo/driver/TrainingDriver.py    |  8 +++++++-
 nnlo/examples/example_cifar10.py | 34 ++++++++++++++++++++++++++++++++
 nnlo/models/Models.py            |  5 ++---
 nnlo/models/get_cifar10.py       |  5 +++--
 nnlo/models/get_mnist.py         |  5 +++--
 nnlo/train/model.py              |  1 +
 nnlo/util/count_epoch.py         | 27 +++++++++++++++++++++++++
 setup.py                         |  6 ++++--
 8 files changed, 81 insertions(+), 10 deletions(-)
 create mode 100644 nnlo/examples/example_cifar10.py
 create mode 100644 nnlo/util/count_epoch.py

diff --git a/nnlo/driver/TrainingDriver.py b/nnlo/driver/TrainingDriver.py
index 9d9c0c4..c4cff2c 100755
--- a/nnlo/driver/TrainingDriver.py
+++ b/nnlo/driver/TrainingDriver.py
@@ -91,7 +91,7 @@ def add_train_options(parser):
     parser.add_argument('--thread_validation', help='run a single process', action='store_true')
 
     # model arguments
-    parser.add_argument('--model', choices=['mnist'], help='File containing model architecture (serialized in JSON/pickle, or provided in a .py file')
+    parser.add_argument('--model', choices=['mnist', 'cifar10'], help='File containing model architecture (serialized in JSON/pickle, or provided in a .py file)')
     parser.add_argument('--trial-name', help='descriptive name for trial',
         default='train', dest='trial_name')
 
@@ -243,6 +243,12 @@ def main():
             model_source = 'examples/example_mnist.py'
         except Exception as e:
             logging.fatal(e)
+    elif args.model == 'cifar10':
+        try:
+            m_module = importlib.import_module(f'nnlo.examples.example_cifar10')
+            model_source = 'examples/example_cifar10.py'
+        except Exception as e:
+            logging.fatal(e)
 
     (features_name, labels_name) = make_features_labels(m_module, args)
     (train_list, val_list) = make_train_val_lists(m_module, args)
diff --git a/nnlo/examples/example_cifar10.py b/nnlo/examples/example_cifar10.py
new file mode 100644
index 0000000..7a1f294
--- /dev/null
+++ b/nnlo/examples/example_cifar10.py
@@ -0,0 +1,34 @@
+from nnlo.models.Models import make_cifar10_model
+
+get_model = make_cifar10_model
+def get_name():
+    return 'cifar10'
+
+def get_all():
+    import socket,os,glob
+    host = os.environ.get('HOST',os.environ.get('HOSTNAME',socket.gethostname()))
+
+    all_list = glob.glob('cifar10_*.h5')
+    if not all_list:
+        all_list = glob.glob('cifar10_*.h5')
+    return all_list
+
+def get_train():
+    all_list = get_all()
+    l = int( len(all_list)*0.70)
+    train_list = all_list[:l]
+    return train_list
+
+def get_val():
+    all_list = get_all()
+    l = int( len(all_list)*0.70)
+    val_list = all_list[l:]
+    return val_list
+
+def get_features():
+    #return ('features', lambda x: x) ##example of data adaptor
+    return 'features'
+
+def get_labels():
+    return 'labels'
+
diff --git a/nnlo/models/Models.py b/nnlo/models/Models.py
index 9244167..bfe6bc1 100644
--- a/nnlo/models/Models.py
+++ b/nnlo/models/Models.py
@@ -108,7 +108,7 @@
     dense1 = args.get('dense1', 512)
     dense2 = args.get('dense2', 256)
 
-    if tensorflow.keras.backend.image_data_format() == 'channels_first':
+    if K.image_data_format() == 'channels_first':
         input_shape = (3, img_rows, img_cols)
     else:
         input_shape = (img_rows, img_cols, 3)
@@ -145,7 +145,6 @@ def make_cifar10_model(**args):
     return 
model def make_mnist_model(**args): - import tensorflow from tensorflow.keras.models import Sequential, Model from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten, Input, Permute from tensorflow.keras.layers import Convolution2D, MaxPooling2D, Conv2D @@ -167,7 +166,7 @@ def make_mnist_model(**args): dense = args.get('dense', 128) pool_size = (ps,ps) - if tensorflow.keras.backend.image_data_format() == 'channels_first': + if K.image_data_format() == 'channels_first': input_shape = (1, img_rows, img_cols) else: input_shape = (img_rows, img_cols, 1) diff --git a/nnlo/models/get_cifar10.py b/nnlo/models/get_cifar10.py index 13b5a8d..77ad4c1 100644 --- a/nnlo/models/get_cifar10.py +++ b/nnlo/models/get_cifar10.py @@ -7,6 +7,7 @@ from tensorflow.python.keras import backend as K import h5py import sys +import os def main(argv): (X_train, Y_train), (X_test, Y_test) = cifar10.load_data() @@ -37,7 +38,7 @@ def main(argv): train_outfile.create_dataset( "features", data=split_X_train[i] ) train_outfile.create_dataset( "labels", data=split_Y_train[i] ) train_outfile.close() - with open('train_cifar10.list', 'w') as train_list_file: + with open(f'{os.getcwd()}/train_cifar10.list', 'w') as train_list_file: for f in train_list: train_list_file.write(f) @@ -49,7 +50,7 @@ def main(argv): test_outfile.create_dataset( "features", data=split_X_test[i] ) test_outfile.create_dataset( "labels", data=split_Y_test[i] ) test_outfile.close() - with open('test_cifar10.list', 'w') as test_list_file: + with open(f'{os.getcwd()}/test_cifar10.list', 'w') as test_list_file: for f in test_list: test_list_file.write(f) diff --git a/nnlo/models/get_mnist.py b/nnlo/models/get_mnist.py index 86cab29..416a80a 100644 --- a/nnlo/models/get_mnist.py +++ b/nnlo/models/get_mnist.py @@ -7,6 +7,7 @@ from tensorflow.python.keras import backend as K import h5py import sys +import os def main(argv): (X_train, Y_train), (X_test, Y_test) = mnist.load_data() @@ -37,7 +38,7 @@ def main(argv): train_outfile.create_dataset( "features", data=split_X_train[i] ) train_outfile.create_dataset( "labels", data=split_Y_train[i] ) train_outfile.close() - with open('train_mnist.list', 'w') as train_list_file: + with open(f'{os.getcwd()}/train_mnist.list', 'w') as train_list_file: for f in train_list: train_list_file.write(f) @@ -49,7 +50,7 @@ def main(argv): test_outfile.create_dataset( "features", data=split_X_test[i] ) test_outfile.create_dataset( "labels", data=split_Y_test[i] ) test_outfile.close() - with open('test_mnist.list', 'w') as test_list_file: + with open(f'{os.getcwd()}/test_mnist.list', 'w') as test_list_file: for f in test_list: test_list_file.write(f) diff --git a/nnlo/train/model.py b/nnlo/train/model.py index b24c6b3..f313fd5 100644 --- a/nnlo/train/model.py +++ b/nnlo/train/model.py @@ -443,6 +443,7 @@ def __init__(self, comm, source, else: self.filename = None self.model = source + logging.debug("Get model {0} from file {1}".format(self.model, self.filename)) self.weights = weights self.custom_objects = custom_objects super(ModelTensorFlow, self).__init__(comm) diff --git a/nnlo/util/count_epoch.py b/nnlo/util/count_epoch.py new file mode 100644 index 0000000..1f854b5 --- /dev/null +++ b/nnlo/util/count_epoch.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python +# Rui Zhang 7.2020 +# rui.zhang@cern.ch + +import json +import logging +import sys + +def main(): + logging.basicConfig(level = logging.INFO) + filenames = [] + try: + filenames = sys.argv[1:] + except: + logging.fatal('python count_epoch.py Usage [json file name]') 
+
+    for filename in filenames:
+        with open(filename) as f:
+            data = json.load(f)
+
+        try:
+            logging.info(f'{filename} epochs {len(data["history"]["0:0:-"]["val_loss"])} val_loss {data["history"]["0:0:-"]["val_loss"][-10]}')
+        except:
+            logging.info(f'{filename} epochs {len(data["history"]["0:-:-"]["val_loss"])} val_loss {data["history"]["0:-:-"]["val_loss"][-10]}')
+
+if __name__ == '__main__':
+    main()
diff --git a/setup.py b/setup.py
index faba807..2c26d15 100644
--- a/setup.py
+++ b/setup.py
@@ -10,8 +10,10 @@
     name='nnlo',
     version='0.0.4',
     entry_points = {
-        'console_scripts': ['TrainingDriver=nnlo.driver.TrainingDriver:main'],
-        'console_scripts': ['GetData=nnlo.models.getdata:main'],
+        'console_scripts': ['TrainingDriver=nnlo.driver.TrainingDriver:main',
+                            'GetData=nnlo.models.getdata:main',
+                            'CountEpoch=nnlo.util.count_epoch:main',
+                           ],
     },
     description='Distributed Machine Learning tool for High Performance Computing',
     long_description_content_type="text/markdown",

From 192788c2dab656771b86ee9e1be72f8032890289 Mon Sep 17 00:00:00 2001
From: Rui Zhang
Date: Mon, 3 Aug 2020 16:33:52 -0400
Subject: [PATCH 07/18] Update CountEpoch

---
 nnlo/util/count_epoch.py | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/nnlo/util/count_epoch.py b/nnlo/util/count_epoch.py
index 1f854b5..77782c9 100644
--- a/nnlo/util/count_epoch.py
+++ b/nnlo/util/count_epoch.py
@@ -5,10 +5,11 @@
 import json
 import logging
 import sys
+import pandas as pd
 
 def main():
     logging.basicConfig(level = logging.INFO)
-    filenames = []
+    filenames, rows_list = [], []
     try:
         filenames = sys.argv[1:]
     except:
@@ -17,11 +18,24 @@ def main():
     for filename in filenames:
         with open(filename) as f:
             data = json.load(f)
-
+        name = filename.split('_')[1]
+        dic = {
+            'file': filename,
+            'ranks': int(name[name.find('n')+1:name.find('g')]),
+            'trainTime': data["train_time"],
+        }
         try:
-            logging.info(f'{filename} epochs {len(data["history"]["0:0:-"]["val_loss"])} val_loss {data["history"]["0:0:-"]["val_loss"][-10]}')
+            dic['val_loss'] = data["history"][r"0:0:-"]["val_loss"][-10]
+            dic['val_accuracy'] = data["history"][r"0:0:-"]["val_accuracy"][-10]
+            dic['epochs'] = len(data["history"][r"0:0:-"]["val_loss"])
         except:
-            logging.info(f'{filename} epochs {len(data["history"]["0:-:-"]["val_loss"])} val_loss {data["history"]["0:-:-"]["val_loss"][-10]}')
+            dic['val_loss'] = data["history"][r"0:-:-"]["val_loss"][-10]
+            dic['val_accuracy'] = data["history"][r"0:-:-"]["val_accuracy"][-10]
+            dic['epochs'] = len(data["history"][r"0:-:-"]["val_loss"])
+        rows_list.append(dic)
+
+    df = pd.DataFrame(rows_list).sort_values('ranks')
+    logging.info(f'\n{df}')
 
 if __name__ == '__main__':
     main()
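
Note: CountEpoch assumes the FitHistory JSON written by the training driver. A minimal sketch of the layout it parses — the `"0:0:-"`/`"0:-:-"` rank keys, the field names, and the `n<ranks>g<gpus>` token in the file name are inferred from the parsing code above, not from a documented schema:

```python
import json

# hypothetical miniature of a history file named like hist_n3g1epoch3_mnist.json;
# CountEpoch reads the rank count from the 'n3g1...' token and indexes
# val_loss[-10], so the per-epoch lists need at least ten entries
record = {
    'train_time': 512.3,
    'history': {
        '0:0:-': {  # '0:-:-' appears instead for runs without a sub-master
            'val_loss':     [0.90 - 0.04 * i for i in range(12)],
            'val_accuracy': [0.70 + 0.02 * i for i in range(12)],
        }
    },
}
with open('hist_n3g1epoch3_mnist.json', 'w') as f:
    json.dump(record, f)
```
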
From 027071247ae1ac7a72be393411f64728ea7ac1c1 Mon Sep 17 00:00:00 2001
From: Rui Zhang
Date: Mon, 3 Aug 2020 16:41:17 -0400
Subject: [PATCH 08/18] Update README

---
 README.md | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index 88445f8..168cc4c 100644
--- a/README.md
+++ b/README.md
@@ -10,36 +10,36 @@ The original package was implemented by [Dustin Anderson](https://github.com/dua
 
 Test with the MNIST dataset, with keras+tensorflow
 ```
-git clone https://github.com/vlimant/NNLO.git
-cd NNLO
+pip install nnlo
 ```
 
-Example with mnist provided in a python file
+Example with the MNIST dataset
 ```
-python3 models/get_mnist.py
-mpirun -np 3 --tag-output python3 TrainingDriver.py --model examples/example_mnist.py --loss categorical_crossentropy --epochs 3
-mpirun -np 3 --tag-output python3 TrainingDriver.py --model examples/example_mnist_torch.py --loss categorical_crossentropy --epochs 3
+GetData mnist
+mpirun -np 3 TrainingDriver --model mnist --loss categorical_crossentropy --epochs 3 --trial-name n3g1epoch3 --train_data /path/to/train_mnist.list --val_data /path/to/test_mnist.list
+mpirun -np 3 python TrainingDriver.py --model examples/example_mnist_torch.py --loss categorical_crossentropy --epochs 3
+jsrun -n 3 -g 1 TrainingDriver --model mnist --loss categorical_crossentropy --epochs 3 --trial-name n3g1epoch3 --train_data /path/to/train_mnist.list --val_data /path/to/test_mnist.list
 ```
 
-Example with the cifar10 with model json
+Example with the CIFAR10 dataset
 ```
-python3 models/BuildModel.py cifar10
-python3 models/get_cifar10.py
-mpirun -np 3 --tag-output python3 TrainingDriver.py --model cifar10_arch.json --train train_cifar10.list --val test_cifar10.list --loss categorical_crossentropy --epochs 5
+GetData cifar10
+mpirun -np 3 TrainingDriver --model cifar10 --loss categorical_crossentropy --epochs 3 --trial-name n3g1epoch3 --train_data /path/to/train_cifar10.list --val_data /path/to/test_cifar10.list
+jsrun -n 3 -g 1 TrainingDriver --model cifar10 --loss categorical_crossentropy --epochs 3 --trial-name n3g1epoch3 --train_data /path/to/train_cifar10.list --val_data /path/to/test_cifar10.list
 ```
 
-Example of training mnist with 2 workers, each with 2 process per Horovod ring
+Example of training mnist with 2 workers, each with 2 processes per Horovod ring
 ```
-mpirun -np 5 --tag-output python3 TrainingDriver.py --model examples/example_mnist.py --loss categorical_crossentropy --epochs 3 --n-processes 2
+mpirun -np 5 python3 TrainingDriver.py --model examples/example_mnist.py --loss categorical_crossentropy --epochs 3 --n-processes 2
 ```
 
 Example of training mnist with early stopping
 ```
-mpirun -np 3 --tag-output python3 TrainingDriver.py --model examples/example_mnist.py --loss categorical_crossentropy --epochs 10000 --early "val_loss,~<,4"
+mpirun -np 3 python3 TrainingDriver.py --model examples/example_mnist.py --loss categorical_crossentropy --epochs 10000 --early "val_loss,~<,4"
 ```
 
 Example of training with a fixed target
 ```
-mpirun -np 3 --tag-output python3 TrainingDriver.py --model examples/example_mnist.py --loss categorical_crossentropy --epochs 10000 --target-metric "val_acc,>,0.97"
+mpirun -np 3 python3 TrainingDriver.py --model examples/example_mnist.py --loss categorical_crossentropy --epochs 10000 --target-metric "val_acc,>,0.97"
 ```
 
 ## GAN Examples (experimental)

From 555a45f0be94055d152e00902fa1d6a19436e12e Mon Sep 17 00:00:00 2001
From: Rui Zhang
Date: Mon, 3 Aug 2020 17:23:19 -0400
Subject: [PATCH 09/18] Extract MNIST TF model from models/Models.py

---
 nnlo/examples/example_mnist.py |  2 +-
 nnlo/models/Models.py          | 98 +++++++++++++++++-----------------
 nnlo/models/model_mnist_tf.py  | 56 +++++++++++++++++++
 nnlo/util/utils.py             |  9 +++-
 setup.py                       |  3 +-
 5 files changed, 115 insertions(+), 53 deletions(-)
 create mode 100644 nnlo/models/model_mnist_tf.py

diff --git a/nnlo/examples/example_mnist.py b/nnlo/examples/example_mnist.py
index 80a9519..cdf8424 100644
--- a/nnlo/examples/example_mnist.py
+++ b/nnlo/examples/example_mnist.py
@@ -1,4 +1,4 @@
-from nnlo.models.Models import make_mnist_model
+from nnlo.models.model_mnist_tf import make_mnist_model
 
 get_model = make_mnist_model
 def get_name():
diff --git a/nnlo/models/Models.py b/nnlo/models/Models.py
index bfe6bc1..1a36ef6 100644
--- a/nnlo/models/Models.py
+++ b/nnlo/models/Models.py
@@ -144,48 +144,48 @@ def make_cifar10_model(**args):
 
     return model
 
-def make_mnist_model(**args):
-    from tensorflow.keras.models import Sequential, Model
-    from 
tensorflow.keras.layers import Dense, Activation, Dropout, Flatten, Input, Permute - from tensorflow.keras.layers import Convolution2D, MaxPooling2D, Conv2D - import tensorflow.keras.backend as K - """MNIST ConvNet from keras/examples/mnist_cnn.py""" - #np.random.seed(1337) # for reproducibility - if args:logging.debug("receiving arguments {}".format(args)) - nb_classes = 10 - # input image dimensions - img_rows, img_cols = 28, 28 - # number of convolutional filters to use - nb_filters = args.get('nb_filters',32) - # size of pooling area for max pooling - ps = args.get('pool_size',2) - - # convolution kernel size - ks = args.get('kernel_size',3) - do = args.get('dropout', 0.25) - dense = args.get('dense', 128) - - pool_size = (ps,ps) - if K.image_data_format() == 'channels_first': - input_shape = (1, img_rows, img_cols) - else: - input_shape = (img_rows, img_cols, 1) - model = Sequential() - model.add(Convolution2D(nb_filters, (ks, ks), - padding='valid', - input_shape=input_shape)) - model.add(Activation('relu')) - model.add(Convolution2D(nb_filters, (ks, ks))) - model.add(Activation('relu')) - model.add(MaxPooling2D(pool_size=pool_size)) - model.add(Dropout(do)) - model.add(Flatten()) - model.add(Dense(dense)) - model.add(Activation('relu')) - model.add(Dropout(do)) - model.add(Dense(nb_classes)) - model.add(Activation('softmax')) - return model +#def make_mnist_model(**args): +# from tensorflow.keras.models import Sequential, Model +# from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten, Input, Permute +# from tensorflow.keras.layers import Convolution2D, MaxPooling2D, Conv2D +# import tensorflow.keras.backend as K +# """MNIST ConvNet from keras/examples/mnist_cnn.py""" +# #np.random.seed(1337) # for reproducibility +# if args:logging.debug("receiving arguments {}".format(args)) +# nb_classes = 10 +# # input image dimensions +# img_rows, img_cols = 28, 28 +# # number of convolutional filters to use +# nb_filters = args.get('nb_filters',32) +# # size of pooling area for max pooling +# ps = args.get('pool_size',2) +# +# # convolution kernel size +# ks = args.get('kernel_size',3) +# do = args.get('dropout', 0.25) +# dense = args.get('dense', 128) +# +# pool_size = (ps,ps) +# if K.image_data_format() == 'channels_first': +# input_shape = (1, img_rows, img_cols) +# else: +# input_shape = (img_rows, img_cols, 1) +# model = Sequential() +# model.add(Convolution2D(nb_filters, (ks, ks), +# padding='valid', +# input_shape=input_shape)) +# model.add(Activation('relu')) +# model.add(Convolution2D(nb_filters, (ks, ks))) +# model.add(Activation('relu')) +# model.add(MaxPooling2D(pool_size=pool_size)) +# model.add(Dropout(do)) +# model.add(Flatten()) +# model.add(Dense(dense)) +# model.add(Activation('relu')) +# model.add(Dropout(do)) +# model.add(Dense(nb_classes)) +# model.add(Activation('softmax')) +# return model def make_mnist_torch_model(**args): if args:logging.debug("receiving arguments {}".format(args)) @@ -212,13 +212,13 @@ def make_topclass_torch_model(**args): try: from skopt.space import Real, Integer, Categorical - make_mnist_model.parameter_range = [ - Integer(10,50, name='nb_filters'), - Integer(2,10, name='pool_size'), - Integer(2,10, name='kernel_size'), - Integer(50,200, name='dense'), - Real(0.0, 1.0, name='dropout') - ] +# make_mnist_model.parameter_range = [ +# Integer(10,50, name='nb_filters'), +# Integer(2,10, name='pool_size'), +# Integer(2,10, name='kernel_size'), +# Integer(50,200, name='dense'), +# Real(0.0, 1.0, name='dropout') +# ] 
make_mnist_torch_model.parameter_range = [ Integer(2,10, name='kernel_size'), Integer(50,200, name='dense'), diff --git a/nnlo/models/model_mnist_tf.py b/nnlo/models/model_mnist_tf.py new file mode 100644 index 0000000..4ff013a --- /dev/null +++ b/nnlo/models/model_mnist_tf.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python +# Rui Zhang 8.2020 +# rui.zhang@cern.ch + +def make_mnist_model(**args): + from tensorflow.keras.models import Sequential, Model + from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten, Input, Permute + from tensorflow.keras.layers import Convolution2D, MaxPooling2D, Conv2D + import tensorflow.keras.backend as K + """MNIST ConvNet from keras/examples/mnist_cnn.py""" + #np.random.seed(1337) # for reproducibility + if args:logging.debug("receiving arguments {}".format(args)) + nb_classes = 10 + # input image dimensions + img_rows, img_cols = 28, 28 + # number of convolutional filters to use + nb_filters = args.get('nb_filters',32) + # size of pooling area for max pooling + ps = args.get('pool_size',2) + + # convolution kernel size + ks = args.get('kernel_size',3) + do = args.get('dropout', 0.25) + dense = args.get('dense', 128) + + pool_size = (ps,ps) + if K.image_data_format() == 'channels_first': + input_shape = (1, img_rows, img_cols) + else: + input_shape = (img_rows, img_cols, 1) + model = Sequential() + model.add(Convolution2D(nb_filters, (ks, ks), + padding='valid', + input_shape=input_shape)) + model.add(Activation('relu')) + model.add(Convolution2D(nb_filters, (ks, ks))) + model.add(Activation('relu')) + model.add(MaxPooling2D(pool_size=pool_size)) + model.add(Dropout(do)) + model.add(Flatten()) + model.add(Dense(dense)) + model.add(Activation('relu')) + model.add(Dropout(do)) + model.add(Dense(nb_classes)) + model.add(Activation('softmax')) + return model + +from skopt.space import Real, Integer, Categorical +make_mnist_model.parameter_range = [ + Integer(10,50, name='nb_filters'), + Integer(2,10, name='pool_size'), + Integer(2,10, name='kernel_size'), + Integer(50,200, name='dense'), + Real(0.0, 1.0, name='dropout') +] + diff --git a/nnlo/util/utils.py b/nnlo/util/utils.py index 483d6cc..73dfe7f 100644 --- a/nnlo/util/utils.py +++ b/nnlo/util/utils.py @@ -56,9 +56,14 @@ def load_model(filename=None, model=None, weights_file=None, custom_objects={}): with open( filename ) as arch_f: json_str = arch_f.readline() new_model = model_from_json( json_str, custom_objects=custom_objects) - if model is not None: + logging.info(f"Load model from filename") + elif model is not None: new_model = clone_model(model) - if weights_file is not None: + logging.info(f"Load model from model") + elif weights_file is not None: new_model.load_weights( weights_file ) + logging.info(f"Load model from weights_file") + else: + logging.error(f"Cannot load model: filename, model and weights_file are None") return new_model diff --git a/setup.py b/setup.py index 2c26d15..33fdc92 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ setup_args = dict( name='nnlo', - version='0.0.4', + version='0.0.5', entry_points = { 'console_scripts': ['TrainingDriver=nnlo.driver.TrainingDriver:main', 'GetData=nnlo.models.getdata:main', @@ -28,6 +28,7 @@ ) install_requires = [ + 'scikit-optimize', ] if __name__ == '__main__': From 09c766a2908bd78915b71ce061a5492a11c524e6 Mon Sep 17 00:00:00 2001 From: Rui Zhang Date: Mon, 3 Aug 2020 17:27:01 -0400 Subject: [PATCH 10/18] Extract cifar10 TF model from model/Model.py --- nnlo/examples/example_cifar10.py | 2 +- nnlo/models/Models.py | 148 
+++++++++++++++---------------- nnlo/models/model_cifar10_tf.py | 79 +++++++++++++++++ 3 files changed, 154 insertions(+), 75 deletions(-) create mode 100644 nnlo/models/model_cifar10_tf.py diff --git a/nnlo/examples/example_cifar10.py b/nnlo/examples/example_cifar10.py index 7a1f294..094234f 100644 --- a/nnlo/examples/example_cifar10.py +++ b/nnlo/examples/example_cifar10.py @@ -1,4 +1,4 @@ -from nnlo.models.Models import make_cifar10_model +from nnlo.models.model_cifar10_tf import make_cifar10_model get_model = make_cifar10_model def get_name(): diff --git a/nnlo/models/Models.py b/nnlo/models/Models.py index 1a36ef6..d587b06 100644 --- a/nnlo/models/Models.py +++ b/nnlo/models/Models.py @@ -79,71 +79,71 @@ def make_topclass_model(**args): #model.summary() return model -def make_cifar10_model(**args): - from tensorflow.keras.models import Sequential, Model - from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten, Input, Permute - from tensorflow.keras.layers import Convolution2D, MaxPooling2D, Conv2D - import tensorflow.keras.backend as K - if args:logging.debug("receiving arguments {}".format(args)) - nb_classes = 10 - img_rows, img_cols = 32, 32 - - # use 1 kernel size for all convolutional layers - ks = args.get('kernel_size', 3) - - # tune the number of filters for each convolution layer - nb_filters1 = args.get('nb_filters1', 48) - nb_filters2 = args.get('nb_filters2', 96) - nb_filters3 = args.get('nb_filters3', 192) - - # tune the pool size once - ps = args.get('pool_size', 2) - pool_size = (ps,ps) - - # tune the dropout rates independently - do4 = args.get('dropout1', 0.25) - do5 = args.get('dropout2', 0.5) - - # tune the dense layers independently - dense1 = args.get('dense1', 512) - dense2 = args.get('dense2', 256) - - if K.image_data_format() == 'channels_first': - input_shape = (3, img_rows, img_cols) - else: - input_shape = (img_rows, img_cols, 3) - - #act = 'sigmoid' - act = 'relu' - - i = Input( input_shape) - l = Conv2D(nb_filters1,( ks, ks), padding='same', activation = act)(i) - l = MaxPooling2D(pool_size=pool_size)(l) - #l = Dropout(do1)(l) - - l = Conv2D(nb_filters2, (ks, ks), padding='same',activation=act)(l) - #l = Conv2D(nb_filters2, (ks, ks))(l) - l = MaxPooling2D(pool_size=pool_size)(l) - #l = Dropout(do2)(l) - - l = Conv2D(nb_filters3, (ks, ks), padding='same',activation=act)(l) - #l = Conv2D(nb_filters3, (ks, ks))(l) - l = MaxPooling2D(pool_size=pool_size)(l) - #l = Dropout(do3)(l) - - l = Flatten()(l) - l = Dense(dense1,activation=act)(l) - l = Dropout(do4)(l) - l = Dense(dense2,activation=act)(l) - l =Dropout(do5)(l) - - o = Dense(nb_classes, activation='softmax')(l) - - model = Model(inputs=i, outputs=o) - #model.summary() - - return model - +#def make_cifar10_model(**args): +# from tensorflow.keras.models import Sequential, Model +# from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten, Input, Permute +# from tensorflow.keras.layers import Convolution2D, MaxPooling2D, Conv2D +# import tensorflow.keras.backend as K +# if args:logging.debug("receiving arguments {}".format(args)) +# nb_classes = 10 +# img_rows, img_cols = 32, 32 +# +# # use 1 kernel size for all convolutional layers +# ks = args.get('kernel_size', 3) +# +# # tune the number of filters for each convolution layer +# nb_filters1 = args.get('nb_filters1', 48) +# nb_filters2 = args.get('nb_filters2', 96) +# nb_filters3 = args.get('nb_filters3', 192) +# +# # tune the pool size once +# ps = args.get('pool_size', 2) +# pool_size = (ps,ps) +# +# # tune the dropout 
rates independently +# do4 = args.get('dropout1', 0.25) +# do5 = args.get('dropout2', 0.5) +# +# # tune the dense layers independently +# dense1 = args.get('dense1', 512) +# dense2 = args.get('dense2', 256) +# +# if K.image_data_format() == 'channels_first': +# input_shape = (3, img_rows, img_cols) +# else: +# input_shape = (img_rows, img_cols, 3) +# +# #act = 'sigmoid' +# act = 'relu' +# +# i = Input( input_shape) +# l = Conv2D(nb_filters1,( ks, ks), padding='same', activation = act)(i) +# l = MaxPooling2D(pool_size=pool_size)(l) +# #l = Dropout(do1)(l) +# +# l = Conv2D(nb_filters2, (ks, ks), padding='same',activation=act)(l) +# #l = Conv2D(nb_filters2, (ks, ks))(l) +# l = MaxPooling2D(pool_size=pool_size)(l) +# #l = Dropout(do2)(l) +# +# l = Conv2D(nb_filters3, (ks, ks), padding='same',activation=act)(l) +# #l = Conv2D(nb_filters3, (ks, ks))(l) +# l = MaxPooling2D(pool_size=pool_size)(l) +# #l = Dropout(do3)(l) +# +# l = Flatten()(l) +# l = Dense(dense1,activation=act)(l) +# l = Dropout(do4)(l) +# l = Dense(dense2,activation=act)(l) +# l =Dropout(do5)(l) +# +# o = Dense(nb_classes, activation='softmax')(l) +# +# model = Model(inputs=i, outputs=o) +# #model.summary() +# +# return model +# #def make_mnist_model(**args): # from tensorflow.keras.models import Sequential, Model # from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten, Input, Permute @@ -235,15 +235,15 @@ def make_topclass_torch_model(**args): Integer(1,6, name='dense_layers'), Real(0.0,1.0, name='dropout') ] - make_cifar10_model.parameter_range = [ - Integer(10,300, name='nb_filters1'), - Integer(10,300, name='nb_filters2'), - Integer(10,300, name='nb_filters3'), - Integer(50,1000, name='dense1'), - Integer(50,1000, name='dense2'), - Real(0.0, 1.0, name='dropout1'), - Real(0.0, 1.0, name='dropout2') - ] +# make_cifar10_model.parameter_range = [ +# Integer(10,300, name='nb_filters1'), +# Integer(10,300, name='nb_filters2'), +# Integer(10,300, name='nb_filters3'), +# Integer(50,1000, name='dense1'), +# Integer(50,1000, name='dense2'), +# Real(0.0, 1.0, name='dropout1'), +# Real(0.0, 1.0, name='dropout2') +# ] except: pass diff --git a/nnlo/models/model_cifar10_tf.py b/nnlo/models/model_cifar10_tf.py new file mode 100644 index 0000000..1f20e16 --- /dev/null +++ b/nnlo/models/model_cifar10_tf.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python +# Rui Zhang 8.2020 +# rui.zhang@cern.ch + +def make_cifar10_model(**args): + from tensorflow.keras.models import Sequential, Model + from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten, Input, Permute + from tensorflow.keras.layers import Convolution2D, MaxPooling2D, Conv2D + import tensorflow.keras.backend as K + if args:logging.debug("receiving arguments {}".format(args)) + nb_classes = 10 + img_rows, img_cols = 32, 32 + + # use 1 kernel size for all convolutional layers + ks = args.get('kernel_size', 3) + + # tune the number of filters for each convolution layer + nb_filters1 = args.get('nb_filters1', 48) + nb_filters2 = args.get('nb_filters2', 96) + nb_filters3 = args.get('nb_filters3', 192) + + # tune the pool size once + ps = args.get('pool_size', 2) + pool_size = (ps,ps) + + # tune the dropout rates independently + do4 = args.get('dropout1', 0.25) + do5 = args.get('dropout2', 0.5) + + # tune the dense layers independently + dense1 = args.get('dense1', 512) + dense2 = args.get('dense2', 256) + + if K.image_data_format() == 'channels_first': + input_shape = (3, img_rows, img_cols) + else: + input_shape = (img_rows, img_cols, 3) + + #act = 'sigmoid' + act = 
'relu' + + i = Input( input_shape) + l = Conv2D(nb_filters1,( ks, ks), padding='same', activation = act)(i) + l = MaxPooling2D(pool_size=pool_size)(l) + #l = Dropout(do1)(l) + + l = Conv2D(nb_filters2, (ks, ks), padding='same',activation=act)(l) + #l = Conv2D(nb_filters2, (ks, ks))(l) + l = MaxPooling2D(pool_size=pool_size)(l) + #l = Dropout(do2)(l) + + l = Conv2D(nb_filters3, (ks, ks), padding='same',activation=act)(l) + #l = Conv2D(nb_filters3, (ks, ks))(l) + l = MaxPooling2D(pool_size=pool_size)(l) + #l = Dropout(do3)(l) + + l = Flatten()(l) + l = Dense(dense1,activation=act)(l) + l = Dropout(do4)(l) + l = Dense(dense2,activation=act)(l) + l =Dropout(do5)(l) + + o = Dense(nb_classes, activation='softmax')(l) + + model = Model(inputs=i, outputs=o) + #model.summary() + + return model + +from skopt.space import Real, Integer, Categorical +make_cifar10_model.parameter_range = [ + Integer(10,300, name='nb_filters1'), + Integer(10,300, name='nb_filters2'), + Integer(10,300, name='nb_filters3'), + Integer(50,1000, name='dense1'), + Integer(50,1000, name='dense2'), + Real(0.0, 1.0, name='dropout1'), + Real(0.0, 1.0, name='dropout2') +] From c137c1131cea95a4c13f8b5110c3962878e06c05 Mon Sep 17 00:00:00 2001 From: Rui Zhang Date: Tue, 4 Aug 2020 12:17:34 -0400 Subject: [PATCH 11/18] Remove example/ dependence in TrainDrive --- nnlo/driver/TrainingDriver.py | 8 ++++---- nnlo/examples/example_cifar10.py | 4 ++-- nnlo/examples/example_mnist.py | 4 ++-- nnlo/models/model_cifar10_tf.py | 7 +++++-- nnlo/models/model_mnist_tf.py | 7 +++++-- 5 files changed, 18 insertions(+), 12 deletions(-) diff --git a/nnlo/driver/TrainingDriver.py b/nnlo/driver/TrainingDriver.py index c4cff2c..9efcb82 100755 --- a/nnlo/driver/TrainingDriver.py +++ b/nnlo/driver/TrainingDriver.py @@ -239,14 +239,14 @@ def main(): m_module, model_source = None, None if args.model == 'mnist': try: - m_module = importlib.import_module(f'nnlo.examples.example_mnist') - model_source = 'examples/example_mnist.py' + m_module = importlib.import_module(f'nnlo.models.model_mnist_tf') + model_source = 'models/model_mnist_tf.py' except Exception as e: logging.fatal(e) elif args.model == 'cifar10': try: - m_module = importlib.import_module(f'nnlo.examples.example_cifar10') - model_source = 'examples/example_cifar10.py' + m_module = importlib.import_module(f'nnlo.models.model_cifar10_tf') + model_source = 'models/model_cifar10_tf.py' except Exception as e: logging.fatal(e) diff --git a/nnlo/examples/example_cifar10.py b/nnlo/examples/example_cifar10.py index 094234f..befd3c1 100644 --- a/nnlo/examples/example_cifar10.py +++ b/nnlo/examples/example_cifar10.py @@ -1,8 +1,8 @@ from nnlo.models.model_cifar10_tf import make_cifar10_model get_model = make_cifar10_model -def get_name(): - return 'cifar10' +#def get_name(): +# return 'cifar10' def get_all(): import socket,os,glob diff --git a/nnlo/examples/example_mnist.py b/nnlo/examples/example_mnist.py index cdf8424..008ff90 100644 --- a/nnlo/examples/example_mnist.py +++ b/nnlo/examples/example_mnist.py @@ -1,8 +1,8 @@ from nnlo.models.model_mnist_tf import make_mnist_model get_model = make_mnist_model -def get_name(): - return 'mnist' +#def get_name(): +# return 'mnist' def get_all(): import socket,os,glob diff --git a/nnlo/models/model_cifar10_tf.py b/nnlo/models/model_cifar10_tf.py index 1f20e16..7b693c4 100644 --- a/nnlo/models/model_cifar10_tf.py +++ b/nnlo/models/model_cifar10_tf.py @@ -2,7 +2,10 @@ # Rui Zhang 8.2020 # rui.zhang@cern.ch -def make_cifar10_model(**args): +def get_name(): + return 
'cifar10' + +def get_model(**args): from tensorflow.keras.models import Sequential, Model from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten, Input, Permute from tensorflow.keras.layers import Convolution2D, MaxPooling2D, Conv2D @@ -68,7 +71,7 @@ def make_cifar10_model(**args): return model from skopt.space import Real, Integer, Categorical -make_cifar10_model.parameter_range = [ +get_model.parameter_range = [ Integer(10,300, name='nb_filters1'), Integer(10,300, name='nb_filters2'), Integer(10,300, name='nb_filters3'), diff --git a/nnlo/models/model_mnist_tf.py b/nnlo/models/model_mnist_tf.py index 4ff013a..8c15019 100644 --- a/nnlo/models/model_mnist_tf.py +++ b/nnlo/models/model_mnist_tf.py @@ -2,7 +2,10 @@ # Rui Zhang 8.2020 # rui.zhang@cern.ch -def make_mnist_model(**args): +def get_name(): + return 'mnist' + +def get_model(**args): from tensorflow.keras.models import Sequential, Model from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten, Input, Permute from tensorflow.keras.layers import Convolution2D, MaxPooling2D, Conv2D @@ -46,7 +49,7 @@ def make_mnist_model(**args): return model from skopt.space import Real, Integer, Categorical -make_mnist_model.parameter_range = [ +get_model.parameter_range = [ Integer(10,50, name='nb_filters'), Integer(2,10, name='pool_size'), Integer(2,10, name='kernel_size'), From 0b88e22a77664adc4bae3adea6fb7320585df0cb Mon Sep 17 00:00:00 2001 From: Rui Zhang Date: Tue, 4 Aug 2020 12:45:25 -0400 Subject: [PATCH 12/18] Separate data/ and models/ and extract all model definitions from models/Model.py --- nnlo/data/__init__.py | 0 nnlo/{models => data}/get_3d.py | 0 nnlo/{models => data}/get_cifar10.py | 9 +- nnlo/{models => data}/get_mnist.py | 11 +- nnlo/{models => data}/get_topclass.py | 0 nnlo/{models => data}/getdata.py | 4 +- nnlo/examples/example_cifar10.py | 4 - nnlo/examples/example_mnist.py | 4 - nnlo/models/Models.py | 232 +------------------------- nnlo/models/model_example_tf.py | 17 ++ nnlo/models/model_mnist_torch.py | 22 +++ nnlo/models/model_topclass_tf.py | 55 ++++++ nnlo/models/model_topclass_torch.py | 27 +++ setup.py | 2 +- 14 files changed, 139 insertions(+), 248 deletions(-) create mode 100644 nnlo/data/__init__.py rename nnlo/{models => data}/get_3d.py (100%) rename nnlo/{models => data}/get_cifar10.py (89%) rename nnlo/{models => data}/get_mnist.py (88%) rename nnlo/{models => data}/get_topclass.py (100%) rename nnlo/{models => data}/getdata.py (79%) create mode 100644 nnlo/models/model_example_tf.py create mode 100644 nnlo/models/model_mnist_torch.py create mode 100644 nnlo/models/model_topclass_tf.py create mode 100644 nnlo/models/model_topclass_torch.py diff --git a/nnlo/data/__init__.py b/nnlo/data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/nnlo/models/get_3d.py b/nnlo/data/get_3d.py similarity index 100% rename from nnlo/models/get_3d.py rename to nnlo/data/get_3d.py diff --git a/nnlo/models/get_cifar10.py b/nnlo/data/get_cifar10.py similarity index 89% rename from nnlo/models/get_cifar10.py rename to nnlo/data/get_cifar10.py index 77ad4c1..82425bf 100644 --- a/nnlo/models/get_cifar10.py +++ b/nnlo/data/get_cifar10.py @@ -6,7 +6,6 @@ from tensorflow.python.keras.utils import np_utils from tensorflow.python.keras import backend as K import h5py -import sys import os def main(argv): @@ -32,25 +31,25 @@ def main(argv): train_list = [] for i in range(num_train_pieces): - train_name = "cifar10_train_%d.h5" % i + train_name = f"{os.getcwd()}/cifar10_train_%d.h5" % i 
train_list.append(train_name+"\n")
         train_outfile = h5py.File( train_name, 'w' )
         train_outfile.create_dataset( "features", data=split_X_train[i] )
         train_outfile.create_dataset( "labels", data=split_Y_train[i] )
         train_outfile.close()
-    with open(f'{os.getcwd()}/train_cifar10.list', 'w') as train_list_file:
+    with open('train_cifar10.list', 'w') as train_list_file:
         for f in train_list:
             train_list_file.write(f)
 
     test_list = []
     for i in range(num_test_pieces):
-        test_name = "cifar10_test_%d.h5" % i
+        test_name = f"{os.getcwd()}/cifar10_test_%d.h5" % i
         test_list.append(test_name+"\n")
         test_outfile = h5py.File( test_name, 'w' )
         test_outfile.create_dataset( "features", data=split_X_test[i] )
         test_outfile.create_dataset( "labels", data=split_Y_test[i] )
         test_outfile.close()
-    with open(f'{os.getcwd()}/test_cifar10.list', 'w') as test_list_file:
+    with open('test_cifar10.list', 'w') as test_list_file:
         for f in test_list:
             test_list_file.write(f)
diff --git a/nnlo/models/get_mnist.py b/nnlo/data/get_mnist.py
similarity index 88%
rename from nnlo/models/get_mnist.py
rename to nnlo/data/get_mnist.py
index 416a80a..93675f0 100644
--- a/nnlo/models/get_mnist.py
+++ b/nnlo/data/get_mnist.py
@@ -6,7 +6,6 @@
 from tensorflow.python.keras.utils import np_utils
 from tensorflow.python.keras import backend as K
 import h5py
-import sys
 import os
 
 def main(argv):
@@ -31,25 +31,25 @@ def main(argv):
 
     train_list = []
     for i in range(num_train_pieces):
-        train_name = "mnist_train_%d.h5" % i
+        train_name = f"{os.getcwd()}/mnist_train_%d.h5" % i
         train_list.append(train_name+"\n")
         train_outfile = h5py.File( train_name, 'w' )
         train_outfile.create_dataset( "features", data=split_X_train[i] )
         train_outfile.create_dataset( "labels", data=split_Y_train[i] )
         train_outfile.close()
-    with open(f'{os.getcwd()}/train_mnist.list', 'w') as train_list_file:
+    with open('train_mnist.list', 'w') as train_list_file:
         for f in train_list:
             train_list_file.write(f)
 
     test_list = []
     for i in range(num_test_pieces):
-        test_name = "mnist_test_%d.h5" % i
+        test_name = f"{os.getcwd()}/mnist_test_%d.h5" % i
         test_list.append(test_name+"\n")
         test_outfile = h5py.File( test_name, 'w' )
         test_outfile.create_dataset( "features", data=split_X_test[i] )
         test_outfile.create_dataset( "labels", data=split_Y_test[i] )
         test_outfile.close()
-    with open(f'{os.getcwd()}/test_mnist.list', 'w') as test_list_file:
+    with open('test_mnist.list', 'w') as test_list_file:
         for f in test_list:
             test_list_file.write(f)
diff --git a/nnlo/models/get_topclass.py b/nnlo/data/get_topclass.py
similarity index 100%
rename from nnlo/models/get_topclass.py
rename to nnlo/data/get_topclass.py
diff --git a/nnlo/models/getdata.py b/nnlo/data/getdata.py
similarity index 79%
rename from nnlo/models/getdata.py
rename to nnlo/data/getdata.py
index 113cbb5..f5db855 100644
--- a/nnlo/models/getdata.py
+++ b/nnlo/data/getdata.py
@@ -8,10 +8,10 @@ def main():
     command = sys.argv[1]
     argv = sys.argv[1:]
     if command.lower() == 'mnist':
-        from nnlo.models.get_mnist import main
+        from nnlo.data.get_mnist import main
         main(argv)
     elif command.lower() == 'cifar10':
-        from nnlo.models.get_cifar10 import main
+        from nnlo.data.get_cifar10 import main
         main(argv)
     else:
         raise RuntimeError('Unknown command: {}'.format(command))
diff --git a/nnlo/examples/example_cifar10.py b/nnlo/examples/example_cifar10.py
index befd3c1..f4fd150 100644
--- a/nnlo/examples/example_cifar10.py
+++ b/nnlo/examples/example_cifar10.py
@@ -1,9 +1,5 @@
 from nnlo.models.model_cifar10_tf import 
make_cifar10_model -get_model = make_cifar10_model -#def get_name(): -# return 'cifar10' - def get_all(): import socket,os,glob host = os.environ.get('HOST',os.environ.get('HOSTNAME',socket.gethostname())) diff --git a/nnlo/examples/example_mnist.py b/nnlo/examples/example_mnist.py index 008ff90..a809c00 100644 --- a/nnlo/examples/example_mnist.py +++ b/nnlo/examples/example_mnist.py @@ -1,9 +1,5 @@ from nnlo.models.model_mnist_tf import make_mnist_model -get_model = make_mnist_model -#def get_name(): -# return 'mnist' - def get_all(): import socket,os,glob host = os.environ.get('HOST',os.environ.get('HOSTNAME',socket.gethostname())) diff --git a/nnlo/models/Models.py b/nnlo/models/Models.py index d587b06..67c89cd 100644 --- a/nnlo/models/Models.py +++ b/nnlo/models/Models.py @@ -8,12 +8,12 @@ def model_function(model_name): """Constructs the Keras model indicated by model_name""" model_maker_dict = { - 'example':make_example_model, - 'mnist':make_mnist_model, - 'cifar10':make_cifar10_model, - 'mnist_torch':make_mnist_torch_model, - 'topclass': make_topclass_model, - 'topclass_torch':make_topclass_torch_model + # 'example':make_example_model, + # 'mnist':make_mnist_model, + # 'cifar10':make_cifar10_model, + # 'mnist_torch':make_mnist_torch_model, + # 'topclass': make_topclass_model, + # 'topclass_torch':make_topclass_torch_model } return model_maker_dict[model_name] @@ -27,223 +27,3 @@ def make_model(model_name, **args): sys.exit(-1) return model_function(model_name)(**args) -def make_example_model(): - """Example model from keras documentation""" - from tensorflow.keras.models import Sequential - from tensorflow.keras.layers import Dense, Activation - model = Sequential() - model.add(Dense(output_dim=64, input_dim=100)) - model.add(Activation("relu")) - model.add(Dense(output_dim=10)) - model.add(Activation("softmax")) - return model - -def make_topclass_model(**args): - from tensorflow.keras.models import Sequential, Model - from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten, Input, Permute - from tensorflow.keras.layers import Convolution2D, MaxPooling2D, Conv2D - if args:logging.debug("receiving arguments {}".format(args)) - conv_layers=args.get('conv_layers',2) - dense_layers=args.get('dense_layers',2) - dropout=args.get('dropout',0.2) - kernel = args.get('kernel_size',3) - classes=3 - in_channels=5 - in_ch = in_channels - ## the trace in the input file is 750, 150, 94, 5 - input = Input( (150,94,in_ch)) - ## convs - c = input - for i in range(conv_layers): - channel_in = in_ch*((i+1)%5) - channel_out = in_ch*((i+2)%5) - if channel_in == 0: channel_in += 1 - if channel_out == 0: channel_out += 1 - c = Conv2D( filters=channel_out, kernel_size=(kernel,kernel) , strides=1, padding="same", activation = 'relu') (c) - c = Conv2D(1, (kernel,kernel), activation = 'relu',strides=2, padding="same")(c) - - ## pooling - pool = args.get('pool', 10) - m = MaxPooling2D((pool,pool))(c) - f = Flatten()(m) - d = f - base = args.get('hidden_factor',5)*100 - for i in range(dense_layers): - N = int(base//(2**(i+1))) - d = Dense( N, activation='relu')(d) - if dropout: - d = Dropout(dropout)(d) - o = Dense(classes, activation='softmax')(d) - - model = Model(inputs=input, outputs=o) - #model.summary() - return model - -#def make_cifar10_model(**args): -# from tensorflow.keras.models import Sequential, Model -# from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten, Input, Permute -# from tensorflow.keras.layers import Convolution2D, MaxPooling2D, Conv2D -# import 
tensorflow.keras.backend as K -# if args:logging.debug("receiving arguments {}".format(args)) -# nb_classes = 10 -# img_rows, img_cols = 32, 32 -# -# # use 1 kernel size for all convolutional layers -# ks = args.get('kernel_size', 3) -# -# # tune the number of filters for each convolution layer -# nb_filters1 = args.get('nb_filters1', 48) -# nb_filters2 = args.get('nb_filters2', 96) -# nb_filters3 = args.get('nb_filters3', 192) -# -# # tune the pool size once -# ps = args.get('pool_size', 2) -# pool_size = (ps,ps) -# -# # tune the dropout rates independently -# do4 = args.get('dropout1', 0.25) -# do5 = args.get('dropout2', 0.5) -# -# # tune the dense layers independently -# dense1 = args.get('dense1', 512) -# dense2 = args.get('dense2', 256) -# -# if K.image_data_format() == 'channels_first': -# input_shape = (3, img_rows, img_cols) -# else: -# input_shape = (img_rows, img_cols, 3) -# -# #act = 'sigmoid' -# act = 'relu' -# -# i = Input( input_shape) -# l = Conv2D(nb_filters1,( ks, ks), padding='same', activation = act)(i) -# l = MaxPooling2D(pool_size=pool_size)(l) -# #l = Dropout(do1)(l) -# -# l = Conv2D(nb_filters2, (ks, ks), padding='same',activation=act)(l) -# #l = Conv2D(nb_filters2, (ks, ks))(l) -# l = MaxPooling2D(pool_size=pool_size)(l) -# #l = Dropout(do2)(l) -# -# l = Conv2D(nb_filters3, (ks, ks), padding='same',activation=act)(l) -# #l = Conv2D(nb_filters3, (ks, ks))(l) -# l = MaxPooling2D(pool_size=pool_size)(l) -# #l = Dropout(do3)(l) -# -# l = Flatten()(l) -# l = Dense(dense1,activation=act)(l) -# l = Dropout(do4)(l) -# l = Dense(dense2,activation=act)(l) -# l =Dropout(do5)(l) -# -# o = Dense(nb_classes, activation='softmax')(l) -# -# model = Model(inputs=i, outputs=o) -# #model.summary() -# -# return model -# -#def make_mnist_model(**args): -# from tensorflow.keras.models import Sequential, Model -# from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten, Input, Permute -# from tensorflow.keras.layers import Convolution2D, MaxPooling2D, Conv2D -# import tensorflow.keras.backend as K -# """MNIST ConvNet from keras/examples/mnist_cnn.py""" -# #np.random.seed(1337) # for reproducibility -# if args:logging.debug("receiving arguments {}".format(args)) -# nb_classes = 10 -# # input image dimensions -# img_rows, img_cols = 28, 28 -# # number of convolutional filters to use -# nb_filters = args.get('nb_filters',32) -# # size of pooling area for max pooling -# ps = args.get('pool_size',2) -# -# # convolution kernel size -# ks = args.get('kernel_size',3) -# do = args.get('dropout', 0.25) -# dense = args.get('dense', 128) -# -# pool_size = (ps,ps) -# if K.image_data_format() == 'channels_first': -# input_shape = (1, img_rows, img_cols) -# else: -# input_shape = (img_rows, img_cols, 1) -# model = Sequential() -# model.add(Convolution2D(nb_filters, (ks, ks), -# padding='valid', -# input_shape=input_shape)) -# model.add(Activation('relu')) -# model.add(Convolution2D(nb_filters, (ks, ks))) -# model.add(Activation('relu')) -# model.add(MaxPooling2D(pool_size=pool_size)) -# model.add(Dropout(do)) -# model.add(Flatten()) -# model.add(Dense(dense)) -# model.add(Activation('relu')) -# model.add(Dropout(do)) -# model.add(Dense(nb_classes)) -# model.add(Activation('softmax')) -# return model - -def make_mnist_torch_model(**args): - if args:logging.debug("receiving arguments {}".format(args)) - try: - from TorchModels import MNistNet - except: - from .TorchModels import MNistNet - model = MNistNet(**args) - return model - -def make_topclass_torch_model(**args): - if 
args:logging.debug("receiving arguments {}".format(args)) - conv_layers=args.get('conv_layers',2) - dense_layers=args.get('dense_layers',2) - dropout=args.get('dropout',0.5) - classes=3 - in_channels=5 - try: - from TorchModels import CNN - except: - from .TorchModels import CNN - model = CNN(conv_layers=conv_layers, dense_layers=dense_layers, dropout=dropout, classes=classes, in_channels=in_channels) - return model - -try: - from skopt.space import Real, Integer, Categorical -# make_mnist_model.parameter_range = [ -# Integer(10,50, name='nb_filters'), -# Integer(2,10, name='pool_size'), -# Integer(2,10, name='kernel_size'), -# Integer(50,200, name='dense'), -# Real(0.0, 1.0, name='dropout') -# ] - make_mnist_torch_model.parameter_range = [ - Integer(2,10, name='kernel_size'), - Integer(50,200, name='dense'), - Real(0.0, 1.0, name='dropout') - ] - make_topclass_model.parameter_range = [ - Integer(1,6, name='conv_layers'), - Integer(1,6, name='dense_layers'), - Integer(1,6, name='kernel_size'), - Real(0.0, 1.0, name='dropout') - ] - make_topclass_torch_model.parameter_range = [ - Integer(1,6, name='conv_layers'), - Integer(1,6, name='dense_layers'), - Real(0.0,1.0, name='dropout') - ] -# make_cifar10_model.parameter_range = [ -# Integer(10,300, name='nb_filters1'), -# Integer(10,300, name='nb_filters2'), -# Integer(10,300, name='nb_filters3'), -# Integer(50,1000, name='dense1'), -# Integer(50,1000, name='dense2'), -# Real(0.0, 1.0, name='dropout1'), -# Real(0.0, 1.0, name='dropout2') -# ] -except: - pass - diff --git a/nnlo/models/model_example_tf.py b/nnlo/models/model_example_tf.py new file mode 100644 index 0000000..43d8c2e --- /dev/null +++ b/nnlo/models/model_example_tf.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python +# Rui Zhang 8.2020 +# rui.zhang@cern.ch + +def get_name(): + return 'example' + +def get_model(**args): + """Example model from keras documentation""" + from tensorflow.keras.models import Sequential + from tensorflow.keras.layers import Dense, Activation + model = Sequential() + model.add(Dense(output_dim=64, input_dim=100)) + model.add(Activation("relu")) + model.add(Dense(output_dim=10)) + model.add(Activation("softmax")) + return model diff --git a/nnlo/models/model_mnist_torch.py b/nnlo/models/model_mnist_torch.py new file mode 100644 index 0000000..7e594c6 --- /dev/null +++ b/nnlo/models/model_mnist_torch.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python +# Rui Zhang 8.2020 +# rui.zhang@cern.ch + +def get_name(): + return 'mnist_torch' + +def get_model(**args): + if args:logging.debug("receiving arguments {}".format(args)) + try: + from TorchModels import MNistNet + except: + from .TorchModels import MNistNet + model = MNistNet(**args) + return model + +from skopt.space import Real, Integer, Categorical +get_model.parameter_range = [ + Integer(2,10, name='kernel_size'), + Integer(50,200, name='dense'), + Real(0.0, 1.0, name='dropout') +] diff --git a/nnlo/models/model_topclass_tf.py b/nnlo/models/model_topclass_tf.py new file mode 100644 index 0000000..18cc99b --- /dev/null +++ b/nnlo/models/model_topclass_tf.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python +# Rui Zhang 8.2020 +# rui.zhang@cern.ch + +def get_name(): + return 'topclass' + +def get_model(**args): + from tensorflow.keras.models import Sequential, Model + from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten, Input, Permute + from tensorflow.keras.layers import Convolution2D, MaxPooling2D, Conv2D + if args:logging.debug("receiving arguments {}".format(args)) + conv_layers=args.get('conv_layers',2) + 
dense_layers=args.get('dense_layers',2) + dropout=args.get('dropout',0.2) + kernel = args.get('kernel_size',3) + classes=3 + in_channels=5 + in_ch = in_channels + ## the trace in the input file is 750, 150, 94, 5 + input = Input( (150,94,in_ch)) + ## convs + c = input + for i in range(conv_layers): + channel_in = in_ch*((i+1)%5) + channel_out = in_ch*((i+2)%5) + if channel_in == 0: channel_in += 1 + if channel_out == 0: channel_out += 1 + c = Conv2D( filters=channel_out, kernel_size=(kernel,kernel) , strides=1, padding="same", activation = 'relu') (c) + c = Conv2D(1, (kernel,kernel), activation = 'relu',strides=2, padding="same")(c) + + ## pooling + pool = args.get('pool', 10) + m = MaxPooling2D((pool,pool))(c) + f = Flatten()(m) + d = f + base = args.get('hidden_factor',5)*100 + for i in range(dense_layers): + N = int(base//(2**(i+1))) + d = Dense( N, activation='relu')(d) + if dropout: + d = Dropout(dropout)(d) + o = Dense(classes, activation='softmax')(d) + + model = Model(inputs=input, outputs=o) + #model.summary() + return model + +from skopt.space import Real, Integer, Categorical +get_model.parameter_range = [ + Integer(1,6, name='conv_layers'), + Integer(1,6, name='dense_layers'), + Integer(1,6, name='kernel_size'), + Real(0.0, 1.0, name='dropout') +] diff --git a/nnlo/models/model_topclass_torch.py b/nnlo/models/model_topclass_torch.py new file mode 100644 index 0000000..abca6e6 --- /dev/null +++ b/nnlo/models/model_topclass_torch.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python +# Rui Zhang 8.2020 +# rui.zhang@cern.ch + +def get_name(): + return 'topclass_torch' + +def get_model(**args): + if args:logging.debug("receiving arguments {}".format(args)) + conv_layers=args.get('conv_layers',2) + dense_layers=args.get('dense_layers',2) + dropout=args.get('dropout',0.5) + classes=3 + in_channels=5 + try: + from TorchModels import CNN + except: + from .TorchModels import CNN + model = CNN(conv_layers=conv_layers, dense_layers=dense_layers, dropout=dropout, classes=classes, in_channels=in_channels) + return model + +from skopt.space import Real, Integer, Categorical +get_model.parameter_range = [ + Integer(1,6, name='conv_layers'), + Integer(1,6, name='dense_layers'), + Real(0.0,1.0, name='dropout') +] diff --git a/setup.py b/setup.py index 33fdc92..38aa5b5 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ version='0.0.5', entry_points = { 'console_scripts': ['TrainingDriver=nnlo.driver.TrainingDriver:main', - 'GetData=nnlo.models.getdata:main', + 'GetData=nnlo.data.getdata:main', 'CountEpoch=nnlo.util.count_epoch:main', ], }, From 2cabcbab4467d94b2b22a39fbfad08a397aab9df Mon Sep 17 00:00:00 2001 From: Rui Zhang Date: Tue, 4 Aug 2020 13:32:23 -0400 Subject: [PATCH 13/18] Enable torch models --- nnlo/driver/TrainingDriver.py | 18 +++++++++--------- nnlo/models/TorchModels.py | 29 ----------------------------- nnlo/models/model_mnist_torch.py | 31 +++++++++++++++++++++++++++---- nnlo/train/model.py | 8 +++++--- 4 files changed, 41 insertions(+), 45 deletions(-) diff --git a/nnlo/driver/TrainingDriver.py b/nnlo/driver/TrainingDriver.py index 9efcb82..1bc076e 100755 --- a/nnlo/driver/TrainingDriver.py +++ b/nnlo/driver/TrainingDriver.py @@ -91,7 +91,7 @@ def add_train_options(parser): parser.add_argument('--thread_validation', help='run a single process', action='store_true') # model arguments - parser.add_argument('--model', choices=['mnist', 'cifar10'], help='File containing model architecture (serialized in JSON/pickle, or provided in a .py file') + parser.add_argument('--model', 
choices=['mnist', 'mnist_torch', 'cifar10', 'cifar10_torch'], help='File containing model architecture (serialized in JSON/pickle, or provided in a .py file') parser.add_argument('--trial-name', help='descriptive name for trial', default='train', dest='trial_name') @@ -237,18 +237,18 @@ def main(): a_backend = 'torch' m_module, model_source = None, None - if args.model == 'mnist': - try: + try: + if args.model == 'mnist': m_module = importlib.import_module(f'nnlo.models.model_mnist_tf') model_source = 'models/model_mnist_tf.py' - except Exception as e: - logging.fatal(e) - elif args.model == 'cifar10': - try: + elif args.model == 'mnist_torch': + m_module = importlib.import_module(f'nnlo.models.model_mnist_torch') + model_source = 'models/model_mnist_torch.py' + elif args.model == 'cifar10': m_module = importlib.import_module(f'nnlo.models.model_cifar10_tf') model_source = 'models/model_cifar10_tf.py' - except Exception as e: - logging.fatal(e) + except Exception as e: + logging.fatal(e) (features_name, labels_name) = make_features_labels(m_module, args) (train_list, val_list) = make_train_val_lists(m_module, args) diff --git a/nnlo/models/TorchModels.py b/nnlo/models/TorchModels.py index 2b412c0..3a9b9bc 100644 --- a/nnlo/models/TorchModels.py +++ b/nnlo/models/TorchModels.py @@ -1,9 +1,6 @@ import torch from torch.autograd import Variable import torch.nn as nn -import torch.nn.parallel -import torch.backends.cudnn as cudnn -import torch.distributed as dist import torch.optim import torch.utils.data.distributed import torchvision.transforms as transforms @@ -13,32 +10,6 @@ import numpy import logging -class MNistNet(nn.Module): - def __init__(self, **args): - super(MNistNet, self).__init__() - ks = int(args.get('kernel_size',5)) - do = float(args.get('dropout',0.5)) - dense = int(args.get('dense',50)) - self.conv1 = nn.Conv2d(1, 10, kernel_size=ks) - self.conv2 = nn.Conv2d(10, 20, kernel_size=ks) - self.conv2_drop = nn.Dropout2d(do) - self.fc1 = nn.Linear(320, dense) - self.fc2 = nn.Linear(dense, 10) - - def forward(self, x): - x = x.permute(0,3,1,2).float() - x = F.relu(F.max_pool2d(self.conv1(x), 2)) - x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) - x = x.view(-1, 320) - x = F.relu(self.fc1(x)) - x = F.dropout(x, training=self.training) - x = self.fc2(x) - #return F.log_softmax(x, dim=1) - #return F.softmax(x) - #return F.cross_entropy(x) - return x - - ### Build a customized CNN with given hyperparameters class _ConvBlock(nn.Sequential): diff --git a/nnlo/models/model_mnist_torch.py b/nnlo/models/model_mnist_torch.py index 7e594c6..e0a6df6 100644 --- a/nnlo/models/model_mnist_torch.py +++ b/nnlo/models/model_mnist_torch.py @@ -1,16 +1,39 @@ #!/usr/bin/env python # Rui Zhang 8.2020 # rui.zhang@cern.ch +import torch.nn as nn +import torch.nn.functional as F def get_name(): return 'mnist_torch' +class MNistNet(nn.Module): + def __init__(self, **args): + super(MNistNet, self).__init__() + ks = int(args.get('kernel_size',5)) + do = float(args.get('dropout',0.5)) + dense = int(args.get('dense',50)) + self.conv1 = nn.Conv2d(1, 10, kernel_size=ks) + self.conv2 = nn.Conv2d(10, 20, kernel_size=ks) + self.conv2_drop = nn.Dropout2d(do) + self.fc1 = nn.Linear(320, dense) + self.fc2 = nn.Linear(dense, 10) + + def forward(self, x): + x = x.permute(0,3,1,2).float() + x = F.relu(F.max_pool2d(self.conv1(x), 2)) + x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) + x = x.view(-1, 320) + x = F.relu(self.fc1(x)) + x = F.dropout(x, training=self.training) + x = self.fc2(x) + #return 
F.log_softmax(x, dim=1)
+        #return F.softmax(x)
+        #return F.cross_entropy(x)
+        return x
+
 def get_model(**args):
     if args:logging.debug("receiving arguments {}".format(args))
-    try:
-        from TorchModels import MNistNet
-    except:
-        from .TorchModels import MNistNet
     model = MNistNet(**args)
     return model
diff --git a/nnlo/train/model.py b/nnlo/train/model.py
index f313fd5..1607eda 100644
--- a/nnlo/train/model.py
+++ b/nnlo/train/model.py
@@ -356,8 +356,10 @@ def test_on_batch(self, x=None, y=None, *args, **kwargs):
         if self.gpus > 0:
             x = x.cuda()
             target = target.cuda()
-        pred = self.model.forward(Variable(x, volatile=True))
-        loss = self.loss(pred, Variable(target, volatile=True))
+        import torch
+        with torch.no_grad():
+            pred = self.model.forward(Variable(x))
+            loss = self.loss(pred, Variable(target))
         l_data = loss.data.numpy() if self.gpus == 0 else loss.data.cpu().numpy()
         self.metrics = [l_data] if l_data.shape==() else [l_data[0]]
         if 'acc' in self.metrics_names: # compute the accuracy
@@ -495,7 +497,7 @@ def __init__(self, comm, source,
         super(ModelPytorch,self).__init__(comm)
         if isinstance(source, six.string_types):
             if source.endswith('.py'):
-                module = __import__(source.replace('.py','').replace('/', '.'), fromlist=[None])
+                module = __import__('nnlo.'+source.replace('.py','').replace('/', '.'), fromlist=[None])
                 self.model = module.get_model()
                 self.filename = None
             else:

From b65f3f3dc4ec697f7becdf78106609e422a4acd7 Mon Sep 17 00:00:00 2001
From: Rui Zhang
Date: Tue, 4 Aug 2020 16:40:44 -0400
Subject: [PATCH 14/18] Remove all dependence on examples/

---
 nnlo/__init__.py                     |  7 ----
 nnlo/driver/TrainingDriver.py        | 29 ++++-----------
 nnlo/examples/__init__.py            |  0
 nnlo/examples/example_cifar10.py     | 30 ----------------
 nnlo/examples/example_mnist.py       | 35 -------------------
 nnlo/examples/example_mnist_torch.py |  5 ---
 .../model_hls4mlgru.py}              |  4 +--
 .../model_jedi_torch.py}             |  0
 setup.py                             |  2 +-
 9 files changed, 10 insertions(+), 102 deletions(-)
 delete mode 100644 nnlo/examples/__init__.py
 delete mode 100644 nnlo/examples/example_cifar10.py
 delete mode 100644 nnlo/examples/example_mnist.py
 delete mode 100644 nnlo/examples/example_mnist_torch.py
 rename nnlo/{examples/example_hls4mlgru.py => models/model_hls4mlgru.py} (96%)
 rename nnlo/{examples/example_jedi_torch.py => models/model_jedi_torch.py} (100%)

diff --git a/nnlo/__init__.py b/nnlo/__init__.py
index 458a243..e69de29 100644
--- a/nnlo/__init__.py
+++ b/nnlo/__init__.py
@@ -1,7 +0,0 @@
-from nnlo.optimize import *
-from nnlo.mpi import *
-from nnlo.train import *
-from nnlo.util import *
-from nnlo.models import *
-from nnlo.examples import *
-from nnlo.driver import *
diff --git a/nnlo/driver/TrainingDriver.py b/nnlo/driver/TrainingDriver.py
index 1bc076e..01be474 100755
--- a/nnlo/driver/TrainingDriver.py
+++ b/nnlo/driver/TrainingDriver.py
@@ -96,8 +96,8 @@ def add_train_options(parser):
         default='train', dest='trial_name')
 
     # training data arguments
-    parser.add_argument('--train_data', help='text file listing data inputs for training', default=None)
-    parser.add_argument('--val_data', help='text file lis`ting data inputs for validation', default=None)
+    parser.add_argument('--train_data', help='text file listing data inputs for training', required=True)
+    parser.add_argument('--val_data', help='text file listing data inputs for validation', required=True)
     parser.add_argument('--features-name', help='name of HDF5 
dataset with output labels', @@ -200,21 +200,11 @@ def make_algo( args, use_tf, comm, validate_every ): def make_train_val_lists(m_module, args): train_list = val_list = [] - if args.train_data: - with open(args.train_data) as train_list_file: - train_list = [ s.strip() for s in train_list_file.readlines() ] - elif m_module is not None: - train_list = m_module.get_train() - else: - logging.info("no training data provided") + with open(args.train_data) as train_list_file: + train_list = [ s.strip() for s in train_list_file.readlines() ] - if args.val_data: - with open(args.val_data) as val_list_file: - val_list = [ s.strip() for s in val_list_file.readlines() ] - elif m_module is not None: - val_list = m_module.get_val() - else: - logging.info("no validation data provided") + with open(args.val_data) as val_list_file: + val_list = [ s.strip() for s in val_list_file.readlines() ] if not train_list: logging.error("No training data provided") @@ -222,11 +212,6 @@ def make_train_val_lists(m_module, args): logging.error("No validation data provided") return (train_list, val_list) -def make_features_labels(m_module, args): - features_name = m_module.get_features() if m_module is not None and hasattr(m_module,"get_features") else args.features_name - labels_name = m_module.get_labels() if m_module is not None and hasattr(m_module,"get_labels") else args.labels_name - return (features_name, labels_name) - def main(): parser = make_train_parser() args = parser.parse_args() @@ -250,7 +235,7 @@ def main(): except Exception as e: logging.fatal(e) - (features_name, labels_name) = make_features_labels(m_module, args) + (features_name, labels_name) = args.features_name, args.labels_name (train_list, val_list) = make_train_val_lists(m_module, args) comm = MPI.COMM_WORLD.Dup() diff --git a/nnlo/examples/__init__.py b/nnlo/examples/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/nnlo/examples/example_cifar10.py b/nnlo/examples/example_cifar10.py deleted file mode 100644 index f4fd150..0000000 --- a/nnlo/examples/example_cifar10.py +++ /dev/null @@ -1,30 +0,0 @@ -from nnlo.models.model_cifar10_tf import make_cifar10_model - -def get_all(): - import socket,os,glob - host = os.environ.get('HOST',os.environ.get('HOSTNAME',socket.gethostname())) - - all_list = glob.glob('mnist_*.h5') - if not all_list: - all_list = glob.glob('mnist_*.h5') - return all_list - -def get_train(): - all_list = get_all() - l = int( len(all_list)*0.70) - train_list = all_list[:l] - return train_list - -def get_val(): - all_list = get_all() - l = int( len(all_list)*0.70) - val_list = all_list[l:] - return val_list - -def get_features(): - #return ('features', lambda x: x) ##example of data adaptor - return 'features' - -def get_labels(): - return 'labels' - diff --git a/nnlo/examples/example_mnist.py b/nnlo/examples/example_mnist.py deleted file mode 100644 index a809c00..0000000 --- a/nnlo/examples/example_mnist.py +++ /dev/null @@ -1,35 +0,0 @@ -from nnlo.models.model_mnist_tf import make_mnist_model - -def get_all(): - import socket,os,glob - host = os.environ.get('HOST',os.environ.get('HOSTNAME',socket.gethostname())) - - if 'daint' in host: - all_list = glob.glob('/scratch/snx3000/vlimant/data/mnist/*.h5') - elif 'titan' in host: - all_list = glob.glob('/ccs/proj/csc291/DATA/mnist/*.h5') - else: - all_list = glob.glob('/bigdata/shared/mnist/*.h5') - if not all_list: - all_list = glob.glob('mnist_*.h5') - return all_list - -def get_train(): - all_list = get_all() - l = int( len(all_list)*0.70) - train_list = 
all_list[:l] - return train_list - -def get_val(): - all_list = get_all() - l = int( len(all_list)*0.70) - val_list = all_list[l:] - return val_list - -def get_features(): - #return ('features', lambda x: x) ##example of data adaptor - return 'features' - -def get_labels(): - return 'labels' - diff --git a/nnlo/examples/example_mnist_torch.py b/nnlo/examples/example_mnist_torch.py deleted file mode 100644 index 8e7fddf..0000000 --- a/nnlo/examples/example_mnist_torch.py +++ /dev/null @@ -1,5 +0,0 @@ -from nnlo.models.Models import make_mnist_torch_model -from nnlo.examples.example_mnist import * - -get_model = make_mnist_torch_model - diff --git a/nnlo/examples/example_hls4mlgru.py b/nnlo/models/model_hls4mlgru.py similarity index 96% rename from nnlo/examples/example_hls4mlgru.py rename to nnlo/models/model_hls4mlgru.py index fa168d7..2eb3783 100644 --- a/nnlo/examples/example_hls4mlgru.py +++ b/nnlo/models/model_hls4mlgru.py @@ -1,6 +1,6 @@ #from keras.activations import relu, selu, elu -from keras.models import Model, Sequential -from keras.layers import Dense, Input, GRU, Dropout, Flatten, Permute +from keras.models import Model +from keras.layers import Dense, Input, GRU, Dropout, Permute import numpy as np def get_model(**args): diff --git a/nnlo/examples/example_jedi_torch.py b/nnlo/models/model_jedi_torch.py similarity index 100% rename from nnlo/examples/example_jedi_torch.py rename to nnlo/models/model_jedi_torch.py diff --git a/setup.py b/setup.py index 38aa5b5..f60a4d6 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ setup_args = dict( name='nnlo', - version='0.0.5', + version='0.0.7', entry_points = { 'console_scripts': ['TrainingDriver=nnlo.driver.TrainingDriver:main', 'GetData=nnlo.data.getdata:main', From a122e942ca375893142fbb94fa3f4ae1b219b71e Mon Sep 17 00:00:00 2001 From: Rui Zhang Date: Tue, 4 Aug 2020 16:48:00 -0400 Subject: [PATCH 15/18] Cleanup models/ --- nnlo/models/BuildModel.py | 51 -------------------- nnlo/models/Models.py | 29 ----------- nnlo/models/TorchModels.py | 75 ----------------------------- nnlo/models/model_topclass_torch.py | 75 +++++++++++++++++++++++++++++ 4 files changed, 75 insertions(+), 155 deletions(-) delete mode 100644 nnlo/models/BuildModel.py delete mode 100644 nnlo/models/Models.py delete mode 100644 nnlo/models/TorchModels.py diff --git a/nnlo/models/BuildModel.py b/nnlo/models/BuildModel.py deleted file mode 100644 index 5e5499b..0000000 --- a/nnlo/models/BuildModel.py +++ /dev/null @@ -1,51 +0,0 @@ -### Builds one of the available models. 
-# Saves model architecture to _arch.json -# and model weights to _weights.h5 - -import os -os.environ['CUDA_VISIBLE_DEVICES']="" -import argparse -import logging - -from Models import make_model - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('model_name', help='model to construct') - parser.add_argument('model_args', nargs='*', help='key=value to pass to the model',default=[]) - args = parser.parse_args() - model_name = args.model_name - model_args = {} - for kw in args.model_args: - k,v = kw.split('=') - try: - v = int(v) - except: - v= float(v) - model_args[k] = v - if model_args: - logging.info("passing {} to the model builder".format(str(model_args))) - model = make_model( model_name ,**model_args) - else: - model = make_model( model_name) - weights_filename = "%s_weights.h5" % model_name - arch_filename = "%s_arch.json" % model_name - - if not "torch" in model_name: - model.summary() - model.save_weights( weights_filename, overwrite=True ) - logging.info("Saved model weights to {0}".format(weights_filename)) - - model_arch = model.to_json() - with open( arch_filename, 'w' ) as arch_file: - arch_file.write( model_arch ) - logging.info("Saved model architecture to {0}".format(arch_filename)) - else: - import torch - weights_filename = weights_filename.replace('h5','torch') - arch_filename = arch_filename.replace('json','torch') - torch.save(model.state_dict(), weights_filename) - logging.info("Saved model weights to {0}".format(weights_filename)) - torch.save(model, arch_filename) - logging.info("Saved model architecture to {0}".format(arch_filename)) - diff --git a/nnlo/models/Models.py b/nnlo/models/Models.py deleted file mode 100644 index 67c89cd..0000000 --- a/nnlo/models/Models.py +++ /dev/null @@ -1,29 +0,0 @@ -### Predefined Keras models - -import sys -import logging -from nnlo.util.utils import import_keras -import_keras() - -def model_function(model_name): - """Constructs the Keras model indicated by model_name""" - model_maker_dict = { - # 'example':make_example_model, - # 'mnist':make_mnist_model, - # 'cifar10':make_cifar10_model, - # 'mnist_torch':make_mnist_torch_model, - # 'topclass': make_topclass_model, - # 'topclass_torch':make_topclass_torch_model - - } - return model_maker_dict[model_name] -def make_model(model_name, **args): - m_fn = model_function(model_name) - if args and hasattr(m_fn,'parameter_range'): - provided = set(args.keys()) - accepted = set([a.name for a in m_fn.parameter_range]) - if not provided.issubset( accepted ): - logging.error("provided arguments {} do not match the accepted ones {}".format(sorted(provided),sorted(accepted))) - sys.exit(-1) - return model_function(model_name)(**args) - diff --git a/nnlo/models/TorchModels.py b/nnlo/models/TorchModels.py deleted file mode 100644 index 3a9b9bc..0000000 --- a/nnlo/models/TorchModels.py +++ /dev/null @@ -1,75 +0,0 @@ -import torch -from torch.autograd import Variable -import torch.nn as nn -import torch.optim -import torch.utils.data.distributed -import torchvision.transforms as transforms -import torchvision.datasets as datasets -import torchvision.models as models -import torch.nn.functional as F -import numpy -import logging - - -### Build a customized CNN with given hyperparameters -class _ConvBlock(nn.Sequential): - def __init__(self, conv_layers, dropout, in_ch=5): - super().__init__() - for i in range(conv_layers): - channel_in = in_ch*((i+1)%5) - channel_out = in_ch*((i+2)%5) - if channel_in == 0: channel_in += 1 - if channel_out == 0: channel_out += 
1 - self.add_module('convlayer%d'%(i), nn.Conv2d(channel_in, out_channels=channel_out,kernel_size=(3,3),stride=1, padding=1)) - self.add_module('relu%d'%(i), nn.ReLU(inplace=True)) - self.add_module('convlayer%d'%(conv_layers), nn.Conv2d(channel_out, out_channels=1, kernel_size=(3,3), stride=2, padding=1)) - self.dropout = dropout - - def forward(self, x): - x = super().forward(x) - if self.dropout > 0: - x = F.dropout(x, p=self.dropout, training=self.training) - return x - -class _DenseBlock(nn.Sequential): - def __init__(self, dense_layers, dropout ,base): - super().__init__() - for i in range(dense_layers): - il = int(base//(2**i)) - ol = int(base//(2**(i+1))) - logging.info("{} =>> {}".format(il,ol)) - self.add_module('denselayer%d'%(i), nn.Linear(il, ol)) - self.add_module('relu%d'%(i), nn.ReLU(inplace=True)) - self.dropout = dropout - - def forward(self, x): - x = super().forward(x) - if self.dropout > 0: - x = F.dropout(x, p=self.dropout, training=self.training) - return x - -class CNN(nn.Module): - def __init__(self, conv_layers=2, dense_layers=2, dropout=0.5, classes=3, in_channels=5): - super().__init__() - self.build_net(conv_layers, dense_layers, dropout, classes, in_channels) - - def build_net(self,*args, **kwargs): - base_2 = 10 - base = base_2**2 - self.conv_layers = _ConvBlock(args[0], args[2], args[4]) - self.dense_layers = _DenseBlock(args[1], args[2], base) - self.adapt_pool = nn.AdaptiveMaxPool2d((base_2,base_2)) - il = int(base//(2**(args[1]))) - ol = int(args[3]) - logging.info("{} =>> {}".format(il,ol)) - self.output = nn.Linear(il, ol) - - def forward(self, x): - x = x.permute(0,3,1,2).float() - x = self.conv_layers(x) - x = self.adapt_pool(x) - x = x.view(x.shape[0], -1) # flatten - x = self.dense_layers(x) - return self.output(x) - - diff --git a/nnlo/models/model_topclass_torch.py b/nnlo/models/model_topclass_torch.py index abca6e6..6560be9 100644 --- a/nnlo/models/model_topclass_torch.py +++ b/nnlo/models/model_topclass_torch.py @@ -2,6 +2,81 @@ # Rui Zhang 8.2020 # rui.zhang@cern.ch +import torch +from torch.autograd import Variable +import torch.nn as nn +import torch.optim +import torch.utils.data.distributed +import torchvision.transforms as transforms +import torchvision.datasets as datasets +import torchvision.models as models +import torch.nn.functional as F +import numpy +import logging + + +### Build a customized CNN with given hyperparameters +class _ConvBlock(nn.Sequential): + def __init__(self, conv_layers, dropout, in_ch=5): + super().__init__() + for i in range(conv_layers): + channel_in = in_ch*((i+1)%5) + channel_out = in_ch*((i+2)%5) + if channel_in == 0: channel_in += 1 + if channel_out == 0: channel_out += 1 + self.add_module('convlayer%d'%(i), nn.Conv2d(channel_in, out_channels=channel_out,kernel_size=(3,3),stride=1, padding=1)) + self.add_module('relu%d'%(i), nn.ReLU(inplace=True)) + self.add_module('convlayer%d'%(conv_layers), nn.Conv2d(channel_out, out_channels=1, kernel_size=(3,3), stride=2, padding=1)) + self.dropout = dropout + + def forward(self, x): + x = super().forward(x) + if self.dropout > 0: + x = F.dropout(x, p=self.dropout, training=self.training) + return x + +class _DenseBlock(nn.Sequential): + def __init__(self, dense_layers, dropout ,base): + super().__init__() + for i in range(dense_layers): + il = int(base//(2**i)) + ol = int(base//(2**(i+1))) + logging.info("{} =>> {}".format(il,ol)) + self.add_module('denselayer%d'%(i), nn.Linear(il, ol)) + self.add_module('relu%d'%(i), nn.ReLU(inplace=True)) + self.dropout = dropout + 
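+    # dropout is applied functionally in forward() (rather than via an
+    # nn.Dropout module) so that it automatically follows self.training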
+
+    def forward(self, x):
+        x = super().forward(x)
+        if self.dropout > 0:
+            x = F.dropout(x, p=self.dropout, training=self.training)
+        return x
+
+class CNN(nn.Module):
+    def __init__(self, conv_layers=2, dense_layers=2, dropout=0.5, classes=3, in_channels=5):
+        super().__init__()
+        self.build_net(conv_layers, dense_layers, dropout, classes, in_channels)
+
+    def build_net(self,*args, **kwargs):
+        base_2 = 10
+        base = base_2**2
+        self.conv_layers = _ConvBlock(args[0], args[2], args[4])
+        self.dense_layers = _DenseBlock(args[1], args[2], base)
+        self.adapt_pool = nn.AdaptiveMaxPool2d((base_2,base_2))
+        il = int(base//(2**(args[1])))
+        ol = int(args[3])
+        logging.info("{} =>> {}".format(il,ol))
+        self.output = nn.Linear(il, ol)
+
+    def forward(self, x):
+        x = x.permute(0,3,1,2).float()
+        x = self.conv_layers(x)
+        x = self.adapt_pool(x)
+        x = x.view(x.shape[0], -1) # flatten
+        x = self.dense_layers(x)
+        return self.output(x)
+
+
 def get_name():
     return 'topclass_torch'
 
From 1312521df25b97292ec6e87cd1a21c73b2842c9c Mon Sep 17 00:00:00 2001
From: Rui Zhang
Date: Wed, 5 Aug 2020 05:04:57 -0400
Subject: [PATCH 16/18] Remove dependence on module name in TrainingDriver;
 support local py file for model definition (need to register local path to
 /sw/summit/xalt/1.2.0/site:/sw/summit/xalt/1.2.0/libexec)

---
 nnlo/driver/TrainingDriver.py       | 25 +++++++++----------------
 nnlo/models/model_cifar10_tf.py     |  3 ---
 nnlo/models/model_example_tf.py     |  3 ---
 nnlo/models/model_hls4mlgru.py      |  3 ---
 nnlo/models/model_jedi_torch.py     |  3 ---
 nnlo/models/model_mnist_tf.py       |  3 ---
 nnlo/models/model_mnist_torch.py    |  3 ---
 nnlo/models/model_topclass_tf.py    |  3 ---
 nnlo/models/model_topclass_torch.py |  3 ---
 nnlo/train/model.py                 |  5 +++--
 10 files changed, 14 insertions(+), 40 deletions(-)

diff --git a/nnlo/driver/TrainingDriver.py b/nnlo/driver/TrainingDriver.py
index 01be474..d282271 100755
--- a/nnlo/driver/TrainingDriver.py
+++ b/nnlo/driver/TrainingDriver.py
@@ -91,7 +91,7 @@ def add_train_options(parser):
     parser.add_argument('--thread_validation', help='run a single process', action='store_true')
 
     # model arguments
-    parser.add_argument('--model', choices=['mnist', 'mnist_torch', 'cifar10', 'cifar10_torch'], help='File containing model architecture (serialized in JSON/pickle, or provided in a .py file')
+    parser.add_argument('--model', required=True, help='File containing model architecture (serialized in JSON/pickle), or provided in a .py file')
     parser.add_argument('--trial-name', help='descriptive name for trial',
             default='train', dest='trial_name')
@@ -198,7 +198,7 @@ def make_algo( args, use_tf, comm, validate_every ):
     logging.info("%s not supported mode", args.mode)
     return algo
 
-def make_train_val_lists(m_module, args):
+def make_train_val_lists(args):
     train_list = val_list = []
     with open(args.train_data) as train_list_file:
         train_list = [ s.strip() for s in train_list_file.readlines() ]
@@ -221,22 +221,21 @@ def main():
     if 'torch' in args.model:
         a_backend = 'torch'
 
-    m_module, model_source = None, None
+    model_source = None
     try:
         if args.model == 'mnist':
-            m_module = importlib.import_module(f'nnlo.models.model_mnist_tf')
-            model_source = 'models/model_mnist_tf.py'
+            model_source = 'nnlo/models/model_mnist_tf.py'
         elif args.model == 'mnist_torch':
-            m_module = importlib.import_module(f'nnlo.models.model_mnist_torch')
-            model_source = 'models/model_mnist_torch.py'
+            model_source = 'nnlo/models/model_mnist_torch.py'
         elif args.model == 'cifar10':
-            m_module = importlib.import_module(f'nnlo.models.model_cifar10_tf')
-            
model_source = 'models/model_cifar10_tf.py'
+            model_source = 'nnlo/models/model_cifar10_tf.py'
+        elif args.model.endswith('.py'):
+            model_source = args.model
     except Exception as e:
         logging.fatal(e)
 
     (features_name, labels_name) = args.features_name, args.labels_name
-    (train_list, val_list) = make_train_val_lists(m_module, args)
+    (train_list, val_list) = make_train_val_lists(args)
 
     comm = MPI.COMM_WORLD.Dup()
     if args.timeline: Timeline.enable()
@@ -246,8 +245,6 @@ def main():
 
     model_weights = make_model_weight(args, use_torch)
 
-    # Theano is the default backend; use tensorflow if --tf is specified.
-    # In the theano case it is necessary to specify the device before importing.
     device = get_device( comm, args.n_masters, gpu_limit=args.max_gpus,
                 gpu_for_master=args.master_gpu)
     os.environ['CUDA_VISIBLE_DEVICES'] = device[-1] if 'gpu' in device else ''
@@ -313,8 +310,8 @@ def main():
             checkpoint=args.checkpoint, checkpoint_interval=args.checkpoint_interval)
 
-    if m_module:
-        model_name =m_module.get_name()
+    if model_source:
+        model_name = os.path.basename(model_source).replace('.py','')
     else:
         model_name = os.path.basename(args.model).replace('.json','')
 
diff --git a/nnlo/models/model_cifar10_tf.py b/nnlo/models/model_cifar10_tf.py
index 7b693c4..8cbf414 100644
--- a/nnlo/models/model_cifar10_tf.py
+++ b/nnlo/models/model_cifar10_tf.py
@@ -2,9 +2,6 @@
 # Rui Zhang 8.2020
 # rui.zhang@cern.ch
 
-def get_name():
-    return 'cifar10'
-
 def get_model(**args):
     from tensorflow.keras.models import Sequential, Model
     from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten, Input, Permute
diff --git a/nnlo/models/model_example_tf.py b/nnlo/models/model_example_tf.py
index 43d8c2e..792677c 100644
--- a/nnlo/models/model_example_tf.py
+++ b/nnlo/models/model_example_tf.py
@@ -2,9 +2,6 @@
 # Rui Zhang 8.2020
 # rui.zhang@cern.ch
 
-def get_name():
-    return 'example'
-
 def get_model(**args):
     """Example model from keras documentation"""
     from tensorflow.keras.models import Sequential
diff --git a/nnlo/models/model_hls4mlgru.py b/nnlo/models/model_hls4mlgru.py
index 2eb3783..a61ac3c 100644
--- a/nnlo/models/model_hls4mlgru.py
+++ b/nnlo/models/model_hls4mlgru.py
@@ -32,9 +32,6 @@ def get_model(**args):
     #               loss='categorical_crossentropy', metrics=['acc'])
     return model
 
-def get_name():
-    return 'hls4ml-gru'
-
 def get_all():
     import os,glob
 
diff --git a/nnlo/models/model_jedi_torch.py b/nnlo/models/model_jedi_torch.py
index cb6fa6b..6e644cf 100644
--- a/nnlo/models/model_jedi_torch.py
+++ b/nnlo/models/model_jedi_torch.py
@@ -158,9 +158,6 @@ def get_model(**args):
 
     return mymodel
 
-def get_name():
-    return 'hls4ml-jedi'
-
 def get_all():
     import os,glob
 
diff --git a/nnlo/models/model_mnist_tf.py b/nnlo/models/model_mnist_tf.py
index 8c15019..ce9c418 100644
--- a/nnlo/models/model_mnist_tf.py
+++ b/nnlo/models/model_mnist_tf.py
@@ -2,9 +2,6 @@
 # Rui Zhang 8.2020
 # rui.zhang@cern.ch
 
-def get_name():
-    return 'mnist'
-
 def get_model(**args):
     from tensorflow.keras.models import Sequential, Model
     from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten, Input, Permute
diff --git a/nnlo/models/model_mnist_torch.py b/nnlo/models/model_mnist_torch.py
index e0a6df6..a945e57 100644
--- a/nnlo/models/model_mnist_torch.py
+++ b/nnlo/models/model_mnist_torch.py
@@ -4,9 +4,6 @@
 import torch.nn as nn
 import torch.nn.functional as F
 
-def get_name():
-    return 'mnist_torch'
-
 class MNistNet(nn.Module):
     def __init__(self, **args):
         super(MNistNet, self).__init__()
diff --git a/nnlo/models/model_topclass_tf.py 
b/nnlo/models/model_topclass_tf.py
index 18cc99b..f48e923 100644
--- a/nnlo/models/model_topclass_tf.py
+++ b/nnlo/models/model_topclass_tf.py
@@ -2,9 +2,6 @@
 # Rui Zhang 8.2020
 # rui.zhang@cern.ch
 
-def get_name():
-    return 'topclass'
-
 def get_model(**args):
     from tensorflow.keras.models import Sequential, Model
     from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten, Input, Permute
diff --git a/nnlo/models/model_topclass_torch.py b/nnlo/models/model_topclass_torch.py
index 6560be9..74cc4b7 100644
--- a/nnlo/models/model_topclass_torch.py
+++ b/nnlo/models/model_topclass_torch.py
@@ -77,9 +77,6 @@ def forward(self, x):
         return self.output(x)
 
 
-def get_name():
-    return 'topclass_torch'
-
 def get_model(**args):
     if args:logging.debug("receiving arguments {}".format(args))
     conv_layers=args.get('conv_layers',2)
diff --git a/nnlo/train/model.py b/nnlo/train/model.py
index 1607eda..c3f88cf 100644
--- a/nnlo/train/model.py
+++ b/nnlo/train/model.py
@@ -7,6 +7,7 @@
 import sys
 import six
 import logging
+import importlib
 
 def tell_gpu_memory(label):
     import gpustat
@@ -436,7 +437,7 @@ def __init__(self, comm, source,
                  custom_objects={}, weights=None):
         if isinstance(source, six.string_types):
             if source.endswith('.py'):
-                module = __import__('nnlo.'+source.replace('.py','').replace('/', '.'), fromlist=[None])
+                module = importlib.import_module(source.replace('.py','').replace('/', '.'))
                 self.model = module.get_model()
                 self.filename = None
             else:
@@ -497,7 +498,7 @@ def __init__(self, comm, source,
         super(ModelPytorch,self).__init__(comm)
         if isinstance(source, six.string_types):
             if source.endswith('.py'):
-                module = __import__('nnlo.'+source.replace('.py','').replace('/', '.'), fromlist=[None])
+                module = importlib.import_module(source.replace('.py','').replace('/', '.'))
                 self.model = module.get_model()
                 self.filename = None
             else:
From 09f3e9aa617f30cb355c7b8fbf1ad8777ef4dd82 Mon Sep 17 00:00:00 2001
From: Rui Zhang
Date: Wed, 5 Aug 2020 05:56:03 -0400
Subject: [PATCH 17/18] Update README; cleanup TrainingDriver

---
 README.md                     | 23 +++++++-----
 nnlo/driver/MPIGDriver.py     | 10 ++---
 nnlo/driver/TrainingDriver.py | 70 ++++++++++-------------------------
 3 files changed, 39 insertions(+), 64 deletions(-)

diff --git a/README.md b/README.md
index 168cc4c..25fa83b 100644
--- a/README.md
+++ b/README.md
@@ -8,20 +8,25 @@ The original package was implemented by [Dustin Anderson](https://github.com/dua
 
 ## Examples
 
-Test with the MNIST dataset, with keras+tensorflow
+Install the package
 ```
 pip install nnlo
 cd NNLO
 ```
-Example with mnist provided in a python file
+Example with mnist using pre-defined model
 ```
 GetData mnist
 mpirun -np 3 TrainingDriver --model mnist --loss categorical_crossentropy --epochs 3 --trial-name n3g1epoch3 --train_data /path/to/train_mnist.list --val_data /path/to/test_mnist.list
-mpirun -np 3 python TrainingDriver.py --model examples/example_mnist_torch.py --loss categorical_crossentropy --epochs 3
 jsrun -n 3 -g 1 TrainingDriver --model mnist --loss categorical_crossentropy --epochs 3 --trial-name n3g1epoch3 --train_data /path/to/train_mnist.list --val_data /path/to/test_mnist.list
 ```
+Example with mnist using user-defined model
+```
+export PYTHONPATH=/path/to/:$PYTHONPATH
+mpirun -np 3 TrainingDriver --model /path/to/mymodel.py --loss categorical_crossentropy 
--epochs 3 --trial-name n3g1epoch3 --train_data /path/to/train_mnist.list --val_data /path/to/test_mnist.list +``` -Example with the cifar10 with model json +Example with the cifar10 using pre-defined model ``` GetData cifar10 python3 models/get_cifar10.py @@ -72,7 +77,7 @@ The provided `TrainingDriver.py` script handles the case of a model that is spec #### Model Use the ModelBuilder class to specify how your model should be constructed: -[mpi_learn/train/model.py](mpi_learn/train/model.py) +[nnlo/train/model.py](nnlo/train/model.py) To specify your model, create a new class deriving from ModelBuilder and override the `build_model()` method. This method should take no arguments and return the Keras model you wish to train. @@ -83,7 +88,7 @@ The provided ModelFromJson class is a specialized ModelBuilder that constructs a #### Training/Testing data Use the Data class to specify how batches of training data should be generated: -[mpi_learn/train/data.py](mpi_learn/train/data.py) +[nnlo/train/data.py](nnlo/train/data.py) To specify your training data, create a new class deriving from Data and override the `generate_data()` method. The `generate_data` method should act as follows: - yield batches of training data in the form required for training with Keras, i.e. ( [x1, x2, ...], [y1, y2, ...] ) @@ -96,10 +101,10 @@ Note: `generate_data` should not continue to yield training batches forever; rat #### Optimization Procedure Use the Algo class to configure the details of the training algorithm: -[mpi_learn/train/algo.py](mpi_learn/train/algo.py) +[nnlo/train/algo.py](nnlo/train/algo.py) Provide an instance of the Algo class when you construct the MPIManager (see below). The Algo constructor takes several arguments that specify aspects of the training process: -- `optimizer`: supported arguments are `'sgd'`, `'adadelta'`, `'rmsprop'`, and `'adam'`. For optimizers that have tunable parameters, please specify the values of those parameters as additional arguments (see [mpi_learn/train/optimizer.py](mpi_learn/train/optimizer.py) for details on the individual optimizers) +- `optimizer`: supported arguments are `'sgd'`, `'adadelta'`, `'rmsprop'`, and `'adam'`. For optimizers that have tunable parameters, please specify the values of those parameters as additional arguments (see [nnlo/train/optimizer.py](nnlo/train/optimizer.py) for details on the individual optimizers) - `loss`: loss function, specified as a string, e.g. 'categorical_crossentropy' - `validate_every`: number of gradient updates to process before performing validation. Set to 0 to disable validation. - `sync_every`: number of batches for workers to process between gradient updates (default 1) @@ -125,7 +130,7 @@ Training is initiated by an instance of the MPIManager class, which initializes - `train_list`, `val_list`: lists of inputs files to use for training and validation. Each MPI process should be able to access any/all of the input files; the MPIManager will split the input files among the available worker processes. 
- `callbacks`: list of `keras` callback objects, to be executed by the master process -Other options are available as well: see [mpi_learn/mpi/manager.py](mpi_learn/mpi/manager.py) +Other options are available as well: see [nnlo/mpi/manager.py](nnlo/mpi/manager.py) ### Training algorithm overview diff --git a/nnlo/driver/MPIGDriver.py b/nnlo/driver/MPIGDriver.py index cd39854..ea62326 100755 --- a/nnlo/driver/MPIGDriver.py +++ b/nnlo/driver/MPIGDriver.py @@ -13,9 +13,9 @@ from time import time,sleep from nnlo.mpi.manager import MPIManager, get_device -from nnlo.train.algo import Algo -from nnlo.train.data import H5Data -from nnlo.train.model import ModelFromJson, ModelTensorFlow +#from nnlo.train.algo import Algo +#from nnlo.train.data import H5Data +#from nnlo.train.model import ModelFromJson, ModelTensorFlow from nnlo.util.utils import import_keras from nnlo.util.logger import initialize_logger import socket @@ -112,8 +112,8 @@ def main(): logging.info(backend) if use_tf: - import_keras() - import keras.backend as K + #import_keras() + import tensorflow.keras.backend as K gpu_options=K.tf.GPUOptions( per_process_gpu_memory_fraction=0.0, allow_growth = True,) diff --git a/nnlo/driver/TrainingDriver.py b/nnlo/driver/TrainingDriver.py index d282271..3205acd 100755 --- a/nnlo/driver/TrainingDriver.py +++ b/nnlo/driver/TrainingDriver.py @@ -22,8 +22,6 @@ from nnlo.util.timeline import Timeline from nnlo.util.logger import initialize_logger -def make_Block_Parser(): - pass def add_log_option(parser): # logging configuration parser.add_argument('--log-file', default=None, dest='log_file', help='log file to write, in additon to output stream') @@ -151,7 +149,7 @@ def make_loader( args, features_name, labels_name, train_list): return data -def make_model_weight(args, use_torch): +def make_model_weight(args, backend): model_weights = None if args.restore: args.restore = re.sub(r'\.algo$', '', args.restore) @@ -159,22 +157,26 @@ def make_model_weight(args, use_torch): with open(args.restore + '.latest', 'r') as latest: args.restore = latest.read().splitlines()[-1] if any([os.path.isfile(ff) for ff in glob.glob('./*'+args.restore + '.model')]): - if use_torch: + if backend == 'torch': args.model = args.restore + '.model' model_weights = args.restore +'.model_w' - else: + elif backend == 'tf': model_weights = args.restore + '.model' + else: + logging.error("%s backend not supported", backend) return model_weights -def make_algo( args, use_tf, comm, validate_every ): +def make_algo( args, backend, comm, validate_every ): args_opt = args.optimizer - if use_tf: - if not args_opt.endswith("tf"): + if backend == 'tf': + if not args_opt.endswith('tf'): args_opt = args_opt + 'tf' - else: - if not args_opt.endswith("torch"): + elif backend == 'torch': + if not args_opt.endswith('torch'): args_opt = args_opt + 'torch' + else: + logging.error("%s backend not supported", backend) if args.mode == 'easgd': algo = Algo(None, loss=args.loss, validate_every=validate_every, @@ -217,10 +219,8 @@ def main(): args = parser.parse_args() initialize_logger(filename=args.log_file, file_level=args.log_level, stream_level=args.log_level) - a_backend = args.backend - if 'torch' in args.model: - a_backend = 'torch' - + backend = 'torch' if 'torch' in args.model else 'tf' + model_source = None try: if args.model == 'mnist': @@ -240,60 +240,30 @@ def main(): if args.timeline: Timeline.enable() - use_tf = a_backend == 'keras' - use_torch = not use_tf - - model_weights = make_model_weight(args, use_torch) + model_weights = 
make_model_weight(args, backend)
 
     device = get_device( comm, args.n_masters, gpu_limit=args.max_gpus,
                 gpu_for_master=args.master_gpu)
     os.environ['CUDA_VISIBLE_DEVICES'] = device[-1] if 'gpu' in device else ''
     logging.debug('set to device %s',os.environ['CUDA_VISIBLE_DEVICES'])
 
-    if use_torch:
+    if backend == 'torch':
         logging.debug("Using pytorch")
         model_builder = ModelPytorch(comm, source=model_source, weights=model_weights,
                     gpus=1 if 'gpu' in device else 0)
-    else:
+    elif backend == 'tf':
         logging.debug("Using TensorFlow")
-        os.environ['KERAS_BACKEND'] = 'tensorflow'
         import tensorflow as tf
-        import_keras()
-        #tf.config.gpu.set_per_process_memory_fraction(0.1)
-        #gpu_options=K.tf.GPUOptions(
-        #    per_process_gpu_memory_fraction=0.1, #was 0.0
-        #    allow_growth = True,
-        #    visible_device_list = device[-1] if 'gpu' in device else '')
-        #gpu_options=K.tf.GPUOptions(
-        #    per_process_gpu_memory_fraction=0.0,
-        #    allow_growth = True,)
         gpu_devices = tf.config.experimental.list_physical_devices('GPU')
         for device in gpu_devices:
             tf.config.experimental.set_memory_growth(device, True)
-
-        #NTHREADS=(2,1)
-        #NTHREADS=None
-        #if NTHREADS is None:
-        #    K.set_session( K.tf.Session( config=K.tf.ConfigProto(
-        #        allow_soft_placement=True, log_device_placement=False,
-        #        gpu_options=gpu_options
-        #        ) ) )
-        #else:
-        #    K.set_session( K.tf.Session( config=K.tf.ConfigProto(
-        #        allow_soft_placement=True, log_device_placement=False,
-        #        gpu_options=gpu_options,
-        #        intra_op_parallelism_threads=NTHREADS[0],
-        #        inter_op_parallelism_threads=NTHREADS[1],
-        #        ) ) )
-
         model_builder = ModelTensorFlow( comm, source=model_source, weights=model_weights)
-
+    else:
+        logging.error("%s backend not supported", backend)
     data = make_loader(args, features_name, labels_name, train_list)
 
     # Some input arguments may be ignored depending on chosen algorithm
-    algo = make_algo( args, use_tf, comm, validate_every=int(data.count_data()/args.batch ))
+    algo = make_algo( args, backend, comm, validate_every=int(data.count_data()/args.batch ))
    if args.restore:
         algo.load(args.restore)
 
From 9697df43551e5742ef60e14808eda5475fd00d8b Mon Sep 17 00:00:00 2001
From: Rui Zhang
Date: Wed, 5 Aug 2020 13:00:51 -0400
Subject: [PATCH 18/18] Add PlotLoss to check loss/accuracy curves

---
 README.md                                    |  8 ++++
 nnlo/util/plot_loss.py                       | 50 ++++++++++++++++++++
 nnlo/util/{count_epoch.py => print_table.py} |  2 +-
 setup.py                                     |  3 +-
 4 files changed, 61 insertions(+), 2 deletions(-)
 create mode 100644 nnlo/util/plot_loss.py
 rename nnlo/util/{count_epoch.py => print_table.py} (94%)

diff --git a/README.md b/README.md
index 25fa83b..6ccd65e 100644
--- a/README.md
+++ b/README.md
@@ -70,6 +70,14 @@ mpirun -tag-output -n 3 python3 MPIGDriver.py dummy.json train_3d.list test_1_3d
 
 See `TrainingDriver.py` for supported optional arguments. Run the script via `mpirun` or `mpiexec`. It automatically detects available NVIDIA GPUs and allocate them among the MPI worker processes.
 
+## Analyse scaling
+
+After running jobs with multiple GPUs, a number of `model_*_history.json` files are created; inspect them with
+```
+PrintTable model_*_history.json
+PlotLoss model_*_history.json
+```
+
 ## Customizing the training process
 
 The provided `TrainingDriver.py` script handles the case of a model that is specified in JSON format and training data that is stored in HDF5 files. However, the construction of the model and the loading of input data are easily customized. 
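+
+A user-defined model file only needs to expose a `get_model(**args)` function that returns the network; files whose path contains `torch` are treated as PyTorch models, everything else as TensorFlow. Below is a minimal, hypothetical sketch of such a `mymodel.py` for the TF backend (the layer sizes and the assumed MNIST-shaped input are illustrative only):
+
+```python
+def get_model(**args):
+    # hyperparameters arrive as keyword arguments; fall back to defaults
+    from tensorflow.keras.models import Sequential
+    from tensorflow.keras.layers import Flatten, Dense, Dropout
+    model = Sequential([
+        Flatten(input_shape=(28, 28, 1)),  # assumed input shape
+        Dense(int(args.get('dense', 128)), activation='relu'),
+        Dropout(float(args.get('dropout', 0.5))),
+        Dense(10, activation='softmax'),
+    ])
+    return model
+```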
diff --git a/nnlo/util/plot_loss.py b/nnlo/util/plot_loss.py
new file mode 100644
index 0000000..84a296e
--- /dev/null
+++ b/nnlo/util/plot_loss.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python
+# Rui Zhang 8.2020
+# rui.zhang@cern.ch
+
+import json
+import logging
+import sys
+import matplotlib
+matplotlib.use('Agg')  # select a non-interactive backend before importing pyplot
+import matplotlib.pyplot as plt
+
+def plotLoss(data, outname='', variable='loss'):
+    # one curve per MPI rank; rank 0 also shows its training curve
+    for irank, values in data["history"].items():
+        irank = irank.split(':')[0]
+        if irank.startswith('0'):
+            plt.plot(values['val_'+variable], linestyle='-', label=f'R{irank}: val')
+            if variable in values:
+                plt.plot(values[variable], linestyle=':', label=f'R{irank}: train')
+        else:
+            plt.plot(values[variable], linestyle='--', label='')
+
+    plt.title(outname)
+    plt.xlabel('Epochs')
+    plt.ylabel(variable.capitalize())
+    plt.legend()
+    plt.savefig(f'{variable}_{outname}.pdf', format='pdf')
+    logging.info(f'Save {variable}_{outname}.pdf')
+    plt.clf()
+
+def main():
+    logging.basicConfig(level = logging.INFO)
+    filenames = sys.argv[1:]
+    if not filenames:
+        logging.fatal('Usage: PlotLoss [json file name]')
+        return
+
+    for filename in filenames:
+        with open(filename) as f:
+            logging.info(f'Read {filename}')
+            data = json.load(f)
+            outname = filename.replace('.json', '')
+            for variable in ['loss', 'accuracy']:
+                plotLoss(data, outname, variable)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/nnlo/util/count_epoch.py b/nnlo/util/print_table.py
similarity index 94%
rename from nnlo/util/count_epoch.py
rename to nnlo/util/print_table.py
index 77782c9..1b29a9e 100644
--- a/nnlo/util/count_epoch.py
+++ b/nnlo/util/print_table.py
@@ -13,7 +13,7 @@ def main():
     try:
         filenames = sys.argv[1:]
     except:
-        logging.fatal('python count_epoch.py Usage [json file name]')
+        logging.fatal('Usage: python print_table.py [json file name]')
 
     for filename in filenames:
         with open(filename) as f:
diff --git a/setup.py b/setup.py
index f60a4d6..a19b0ae 100644
--- a/setup.py
+++ b/setup.py
@@ -12,7 +12,8 @@
     entry_points = {
         'console_scripts': ['TrainingDriver=nnlo.driver.TrainingDriver:main',
                             'GetData=nnlo.data.getdata:main',
-                            'CountEpoch=nnlo.util.count_epoch:main',
+                            'PrintTable=nnlo.util.print_table:main',
+                            'PlotLoss=nnlo.util.plot_loss:main',
                             ],
         },
     description='Distributed Machine Learning tool for High Performance Computing',