From 2bd4df8f01ee7c71192e412eabd0f0cd4a7a762e Mon Sep 17 00:00:00 2001
From: hendraet
Date: Sat, 8 Feb 2020 10:25:12 +0100
Subject: [PATCH 1/4] Get stuff running again

---
 .gitignore                                   |   1 +
 .../arabic_offline.py                        |   2 +-
 .../arabic_online.py                         |   2 +-
 .../farsi_offline_handwriting/farsi_chars.py |   2 +-
 examples/online_prediction/.gitignore        |  17 ++
 examples/online_prediction/check.config      |   2 +-
 examples/online_prediction/check3x1.config   |   2 +-
 examples/online_prediction/online_delta.py   | 155 +++++++++---------
 examples/online_prediction/requirements.txt  |   6 +
 utils/netcdf_helpers.py                      |  67 ++++----
 utils/normalise_netcdf.py                    | 120 ++++++++------
 11 files changed, 215 insertions(+), 161 deletions(-)
 create mode 100644 examples/online_prediction/.gitignore
 create mode 100644 examples/online_prediction/requirements.txt

diff --git a/.gitignore b/.gitignore
index 9350668..b24482f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -37,3 +37,4 @@ gradient_check.build/
 Debug
 Release
 rnnlib.xcodeproj
+.idea
diff --git a/examples/arabic_offline_handwriting/arabic_offline.py b/examples/arabic_offline_handwriting/arabic_offline.py
index a987704..c9fe8c7 100755
--- a/examples/arabic_offline_handwriting/arabic_offline.py
+++ b/examples/arabic_offline_handwriting/arabic_offline.py
@@ -167,7 +167,7 @@ def convertToPrimaries (labelString):
  offset += 1
 
 #create a new .nc file
-file = netcdf_helpers.NetCDFFile(outputFilename, 'w')
+file = netcdf_helpers.netcdf_file(outputFilename, 'w')
 
 #create the dimensions
 netcdf_helpers.createNcDim(file,'numSeqs',len(seqLengths))
diff --git a/examples/arabic_online_handwriting/arabic_online.py b/examples/arabic_online_handwriting/arabic_online.py
index 91f7e4e..a3cecd1 100755
--- a/examples/arabic_online_handwriting/arabic_online.py
+++ b/examples/arabic_online_handwriting/arabic_online.py
@@ -69,7 +69,7 @@
 print labels
 
 #create a new .nc file
-file = netcdf_helpers.NetCDFFile(ncFilename, 'w')
+file = netcdf_helpers.netcdf_file(ncFilename, 'w')
 
 #create the dimensions
 netcdf_helpers.createNcDim(file,'numSeqs',len(seqLengths))
diff --git a/examples/farsi_offline_handwriting/farsi_chars.py b/examples/farsi_offline_handwriting/farsi_chars.py
index bb6ddaf..16308c6 100755
--- a/examples/farsi_offline_handwriting/farsi_chars.py
+++ b/examples/farsi_offline_handwriting/farsi_chars.py
@@ -89,7 +89,7 @@
 print labels
 
 #create a new .nc file
-file = netcdf_helpers.NetCDFFile(ncFilename, 'w')
+file = netcdf_helpers.netcdf_file(ncFilename, 'w')
 
 #create the dimensions
 netcdf_helpers.createNcDim(file,'numSeqs',len(seqLengths))
diff --git a/examples/online_prediction/.gitignore b/examples/online_prediction/.gitignore
new file mode 100644
index 0000000..46e7203
--- /dev/null
+++ b/examples/online_prediction/.gitignore
@@ -0,0 +1,17 @@
+### VirtualEnv ###
+# Virtualenv
+# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
+.Python
+pyvenv.cfg
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+pip-selfcheck.json
+include/
+lib/
+local
+
diff --git a/examples/online_prediction/check.config b/examples/online_prediction/check.config
index 35f5c66..7f9300a 100644
--- a/examples/online_prediction/check.config
+++ b/examples/online_prediction/check.config
@@ -1,6 +1,6 @@
 task prediction
 hiddenType lstm
-trainFile one_line.nc
+trainFile online.nc
 dataFraction 1
 maxTestsNoBest 20
 hiddenSize 1
diff --git a/examples/online_prediction/check3x1.config b/examples/online_prediction/check3x1.config
index 00ab457..af571bb 100644
--- a/examples/online_prediction/check3x1.config
+++ b/examples/online_prediction/check3x1.config
@@ -1,6 +1,6 @@
 task prediction
 hiddenType lstm
-trainFile one_line.nc
+trainFile online.nc
 dataFraction 1
 maxTestsNoBest 20
 hiddenSize 1,1,1
diff --git a/examples/online_prediction/online_delta.py b/examples/online_prediction/online_delta.py
index 9d6e4c4..1aad2f6 100755
--- a/examples/online_prediction/online_delta.py
+++ b/examples/online_prediction/online_delta.py
@@ -8,32 +8,36 @@ import re
 from xml.dom.minidom import parse
 
-def Std(array,axis):
- if shape(array)[axis]>1:
-  return (std(array,axis))
- return array
+
+def Std(array, axis):
+    if shape(array)[axis] > 1:
+        return std(array, axis)
+    return array
+
+
 def GetTargetString(strokeFileName):
- asciiFileName = re.sub('lineStrokes', 'ascii', strokeFileName)
- asciiFileName = re.sub('-[0-9]+\.xml', '.txt', asciiFileName)
- try:
-  lineNr = int(re.search('-([0-9]+)\.xml', strokeFileName).group(1))
-  lines = [line.strip() for line in open(asciiFileName)]
-  return lines[lineNr+lines.index('CSR:') + 1]
- except (AttributeError, IndexError) as e:
-  raise SystemExit
-  return ' '
+    asciiFileName = re.sub('lineStrokes', 'ascii', strokeFileName)
+    asciiFileName = re.sub('-[0-9]+\.xml', '.txt', asciiFileName)
+    try:
+        lineNr = int(re.search('-([0-9]+)\.xml', strokeFileName).group(1))
+        lines = [line.strip() for line in open(asciiFileName)]
+        return lines[lineNr + lines.index('CSR:') + 1]
+    except (AttributeError, IndexError) as e:
+        raise SystemExit
+        return ' '
+
 
-#command line options
+# command line options
 parser = OptionParser()
 
-#parse command line options
+# parse command line options
 (options, args) = parser.parse_args()
-if (len(args)<2):
- print "usage: -options input_filename output_filename"
- print options
- sys.exit(2)
+if (len(args) < 2):
+    print "usage: -options input_filename output_filename"
+    print options
+    sys.exit(2)
 
-inputFilename = args [0]
+inputFilename = args[0]
 ncFilename = args[1]
 print options
 print "input filename", inputFilename
@@ -49,67 +53,68 @@ def GetTargetString(strokeFileName):
 targetSeqDims = []
 print "reading data files"
 for l in file(inputFilename).readlines():
- inkmlfile = l.strip()
- if len(inkmlfile):
-  seqTags.append(inkmlfile)
-  wordTargetStrings.append(' ')
-  seqTxt = GetTargetString(inkmlfile)
-  targetStrings.append(seqTxt)
-  oldlen = len(inputs)
-  oldlenPred = len(predictions)
-  firstCoord = array([])
-  for trace in parse(inkmlfile).getElementsByTagName('Stroke'):
-   for coords in trace.getElementsByTagName('Point'):
-    pt = array([float(coords.getAttribute('x').strip()), float(coords.getAttribute('y').strip())])
-    last = array([float(pt[0]), float(pt[1]), 0.0])
-    if len(firstCoord) == 0: firstCoord = last
-    last = last - firstCoord
-    inputs.append(last)
-    inputs[-1][-1] = 1
-  predictions.extend(inputs[oldlen+1:])
-  predictions.append([float(0.0), float(0.0), float(0.0)])
-  seqLengths.append(len(inputs) - oldlen)
-  predSeqLengths.append(len(predictions) - oldlenPred)
-  seqDims.append([seqLengths[-1]])
-  targetSeqDims.append([predSeqLengths[-1]])
-
+    inkmlfile = l.strip()
+    if len(inkmlfile):
+        seqTags.append(inkmlfile)
+        wordTargetStrings.append(' ')
+        seqTxt = GetTargetString(inkmlfile)
+        targetStrings.append(seqTxt)
+        oldlen = len(inputs)
+        oldlenPred = len(predictions)
+        firstCoord = array([])
+        for trace in parse(inkmlfile).getElementsByTagName('Stroke'):
+            for coords in trace.getElementsByTagName('Point'):
+                pt = array([float(coords.getAttribute('x').strip()), float(coords.getAttribute('y').strip())])
+                last = array([float(pt[0]), float(pt[1]), 0.0])
+                if len(firstCoord) == 0: firstCoord = last
+                last = last - firstCoord
+                inputs.append(last)
+                inputs[-1][-1] = 1
+        predictions.extend(inputs[oldlen + 1:])
+        predictions.append([float(0.0), float(0.0), float(0.0)])
+        seqLengths.append(len(inputs) - oldlen)
+        predSeqLengths.append(len(predictions) - oldlenPred)
+        seqDims.append([seqLengths[-1]])
+        targetSeqDims.append([predSeqLengths[-1]])
 
 firstIx = 0
 for i in range(len(seqLengths)):
- for k in reversed(range(seqLengths[i])):
-  if k > 0:
-   inputs[firstIx + k] = array(inputs[firstIx + k]) - array(inputs[firstIx + k - 1])
-   inputs[firstIx + k][-1] = abs(inputs[firstIx + k][-1])
-   predictions[firstIx + k - 1 ] = inputs[firstIx + k]
-  if k == 0:
-   predictions[firstIx] = inputs[firstIx+1]
-   inputs[firstIx] = array([0, 0, 0])
- firstIx += seqLengths[i]
-
+    for k in reversed(range(seqLengths[i])):
+        if k > 0:
+            inputs[firstIx + k] = array(inputs[firstIx + k]) - array(inputs[firstIx + k - 1])
+            inputs[firstIx + k][-1] = abs(inputs[firstIx + k][-1])
+            predictions[firstIx + k - 1] = inputs[firstIx + k]
+        if k == 0:
+            predictions[firstIx] = inputs[firstIx + 1]
+            inputs[firstIx] = array([0, 0, 0])
+    firstIx += seqLengths[i]
 
-#create a new .nc file
+# create a new .nc file
 print ("open file %s", ncFilename)
-file = netcdf_helpers.NetCDFFile(ncFilename, 'w')
-
-#create the dimensions
-netcdf_helpers.createNcDim(file,'numSeqs',len(seqLengths))
-netcdf_helpers.createNcDim(file,'numTimesteps',len(inputs))
-netcdf_helpers.createNcDim(file,'predNumTimesteps',len(predictions))
-netcdf_helpers.createNcDim(file,'inputPattSize',len(inputs[0]))
-netcdf_helpers.createNcDim(file,'numDims',1)
+nc_file = netcdf_helpers.netcdf_file(ncFilename, 'w')
 
+# create the dimensions
+netcdf_helpers.createNcDim(nc_file, 'numSeqs', len(seqLengths))
+netcdf_helpers.createNcDim(nc_file, 'numTimesteps', len(inputs))
+netcdf_helpers.createNcDim(nc_file, 'predNumTimesteps', len(predictions))
+netcdf_helpers.createNcDim(nc_file, 'inputPattSize', len(inputs[0]))
+netcdf_helpers.createNcDim(nc_file, 'numDims', 1)
 
-#create the variables
-netcdf_helpers.createNcStrings(file,'seqTags',seqTags,('numSeqs','maxSeqTagLength'),'sequence tags')
-netcdf_helpers.createNcStrings(file,'targetStrings',targetStrings,('numSeqs','maxTargStringLength'),'target strings')
-netcdf_helpers.createNcStrings(file,'wordTargetStrings',wordTargetStrings,('numSeqs','maxWordTargStringLength'),'word target strings')
-netcdf_helpers.createNcVar(file,'seqLengths',seqLengths,'i',('numSeqs',),'sequence lengths')
-netcdf_helpers.createNcVar(file,'seqDims',seqDims,'i',('numSeqs','numDims'),'sequence dimensions')
-netcdf_helpers.createNcVar(file,'inputs',inputs,'f',('numTimesteps','inputPattSize'),'input patterns')
-netcdf_helpers.createNcVar(file,'predSeqLengths', predSeqLengths,'i',('numSeqs',),'pred sequence lengths')
-netcdf_helpers.createNcVar(file,'targetSeqDims', targetSeqDims,'i',('numSeqs','numDims'),'pred sequence dimensions')
-netcdf_helpers.createNcVar(file,'targetPatterns', predictions,'f',('predNumTimesteps','inputPattSize'),'prediction patterns')
+# create the variables
+netcdf_helpers.createNcStrings(nc_file, 'seqTags', seqTags, ('numSeqs', 'maxSeqTagLength'), 'sequence tags')
+netcdf_helpers.createNcStrings(nc_file, 'targetStrings', targetStrings, ('numSeqs', 'maxTargStringLength'),
+                               'target strings')
+netcdf_helpers.createNcStrings(nc_file, 'wordTargetStrings', wordTargetStrings, ('numSeqs', 'maxWordTargStringLength'),
+                               'word target strings')
+netcdf_helpers.createNcVar(nc_file, 'seqLengths', seqLengths, 'i', ('numSeqs',), 'sequence lengths')
+netcdf_helpers.createNcVar(nc_file, 'seqDims', seqDims, 'i', ('numSeqs', 'numDims'), 'sequence dimensions')
+netcdf_helpers.createNcVar(nc_file, 'inputs', inputs, 'f', ('numTimesteps', 'inputPattSize'), 'input patterns')
+netcdf_helpers.createNcVar(nc_file, 'predSeqLengths', predSeqLengths, 'i', ('numSeqs',), 'pred sequence lengths')
+netcdf_helpers.createNcVar(nc_file, 'targetSeqDims', targetSeqDims, 'i', ('numSeqs', 'numDims'),
+                           'pred sequence dimensions')
+netcdf_helpers.createNcVar(nc_file, 'targetPatterns', predictions, 'f', ('predNumTimesteps', 'inputPattSize'),
+                           'prediction patterns')
 
-#write the data to disk
+# write the data to disk
 print "closing file", ncFilename
-file.close()
+nc_file.close()
diff --git a/examples/online_prediction/requirements.txt b/examples/online_prediction/requirements.txt
new file mode 100644
index 0000000..40127e6
--- /dev/null
+++ b/examples/online_prediction/requirements.txt
@@ -0,0 +1,6 @@
+cftime==1.0.4.2
+netCDF4==1.5.3
+numpy==1.16.6
+Pillow==6.2.2
+ScientificPython==2.9.4
+scipy==1.2.3
diff --git a/utils/netcdf_helpers.py b/utils/netcdf_helpers.py
index 141e138..4e320d1 100755
--- a/utils/netcdf_helpers.py
+++ b/utils/netcdf_helpers.py
@@ -1,4 +1,4 @@
-#Copyright 2009,2010 Alex Graves
+# Copyright 2009,2010 Alex Graves
 #
 # This file is part of RNNLIB.
 #
@@ -15,37 +15,48 @@
 # You should have received a copy of the GNU General Public License
 # along with RNNLIB. If not, see <http://www.gnu.org/licenses/>.
 
-from Scientific.IO.NetCDF import NetCDFFile
-from numpy import *
+# from Scientific.IO.NetCDF import NetCDFFile
+import netCDF4
+from scipy.io.netcdf import netcdf_file
+import numpy as np
 
-def createNcDim(ncfile,name,d):
- print "creating netcdf dimension:",name,d
- ncfile.createDimension(name,d)
-#assumes ncfile will be written over (opened with 'w')
-def createNcVar(ncfile,vname,data,vtype,dims,desc):
- print "creating netcdf variable",vname
- nc_var = ncfile.createVariable (vname,vtype,dims)
- nc_var.longname = desc
- nc_var.assignValue(data)
- print shape(nc_var)
+
+def createNcDim(ncfile, name, d):
+    print "creating netcdf dimension:", name, d
+    ncfile.createDimension(name, d)
+
+
+# assumes ncfile will be written over (opened with 'w')
+def createNcVar(ncfile, vname, data, vtype, dims, desc):
+    print "creating netcdf variable", vname
+    nc_var = ncfile.createVariable(vname, vtype, dims)
+    nc_var.longname = desc
+    if vtype == 'S1':
+        np_data = data
+    else:
+        np_data = np.asarray(data, dtype=nc_var.typecode() + str(nc_var.itemsize()))
+    assert nc_var.shape == np_data.shape  # TODO:remove
+    nc_var[:] = np_data
+    print nc_var.shape
+
 
 def maxLen(strings):
- maxLength=0
- for s in strings:
-  length=len(s)
-  if (length>maxLength):
-   maxLength=length
- return maxLength
-
-def createNcStrings(ncfile,vname,strings,dims,desc):
- print "wrting strings", vname
- maxLength = maxLen(strings) + 1
- nullStrings = []
- for s in strings:
-  nullStrings.append(list(s) +['\0']*(maxLength - len(s)))
- createNcDim(ncfile,dims[1],maxLength)
- createNcVar(ncfile,vname,array(nullStrings),'c',dims,desc)
+    maxLength = 0
+    for s in strings:
+        length = len(s)
+        if (length > maxLength):
+            maxLength = length
+    return maxLength
+
+
+def createNcStrings(ncfile, vname, strings, dims, desc):
+    str_length = maxLen(strings)
+
+    chars = np.empty((len(strings), str_length), dtype='S' + str(str_length))
+    [np.append(chars, netCDF4.stringtoarr(string, str_length, 'S')) for string in strings]
+
+    createNcDim(ncfile, dims[1], str_length)
+    createNcVar(ncfile, vname, np.array(chars), 'S1', dims, desc)
 
 # def createNcString(ncfile,vname,string,dims,desc):
 #  print "writing string",vname
diff --git a/utils/normalise_netcdf.py b/utils/normalise_netcdf.py
index a68ffaa..0fd1e84 100755
--- a/utils/normalise_netcdf.py
+++ b/utils/normalise_netcdf.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-#Copyright 2009,2010 Alex Graves
+# Copyright 2009,2010 Alex Graves
 #
 # This file is part of RNNLIB.
 #
@@ -16,85 +16,99 @@
 # You should have received a copy of the GNU General Public License
 # along with RNNLIB. If not, see <http://www.gnu.org/licenses/>.
 
-#!/usr/bin/env python
+# !/usr/bin/env python
 
 import netcdf_helpers
 import sys
-from numpy import *
+import numpy as np
 from optparse import OptionParser
 
 parser = OptionParser("usage: %prog input_filename output_filename")
-parser.add_option("-m", "--maxarraysize", action="store", type="int", dest="maxArraySize", default=8000000, help="maximum array size for std and mean calcs")
-parser.add_option("-i", "--inputarrayname", action="store", type="string", dest="inputArrayName", default="inputs", help="name of input array")
-parser.add_option("-o", "--outputarrayname", action="store", type="string", dest="outputArrayName", default="inputs", help="name of output array")
-parser.add_option("-s", "--stdmeanfilename", action="store", type="string", dest="stdMeanFilename", default="", help="file to use for stds and means")
-parser.add_option("-b", "--bigfile", action="store_true", dest="bigFile", default=False, help="use memory optimisations for big files? (slower)")
-parser.add_option("-c", "--booleanColomn", action="store", type="int", dest="booleanColomn", default=-1, help="dont normalize nth colomn of input array")
+parser.add_option("-m", "--maxarraysize", action="store", type="int", dest="maxArraySize", default=8000000,
+                  help="maximum array size for std and mean calcs")
+parser.add_option("-i", "--inputarrayname", action="store", type="string", dest="inputArrayName", default="inputs",
+                  help="name of input array")
+parser.add_option("-o", "--outputarrayname", action="store", type="string", dest="outputArrayName", default="inputs",
+                  help="name of output array")
+parser.add_option("-s", "--stdmeanfilename", action="store", type="string", dest="stdMeanFilename", default="",
+                  help="file to use for stds and means")
+parser.add_option("-b", "--bigfile", action="store_true", dest="bigFile", default=False,
+                  help="use memory optimisations for big files? (slower)")
+parser.add_option("-c", "--booleanColomn", action="store", type="int", dest="booleanColomn", default=-1,
+                  help="dont normalize nth colomn of input array")
+
+
+def std(array, axis):
+    if np.shape(array)[axis] > 1:
+        return np.std(array, axis)
+    return array
 
-def Std(array,axis):
- if shape(array)[axis]>1:
-  return (std(array,axis))
- return array
 
-#parse command line options
+# parse command line options
 (options, args) = parser.parse_args()
 print options
-if (len(args) != 2):
- parser.error("incorrect number of arguments")
+if len(args) != 2:
+    parser.error("incorrect number of arguments")
 
 inputFilename = args[0]
 outputFilename = args[1]
 print 'inputFilename', inputFilename
-infile = netcdf_helpers.NetCDFFile(inputFilename, 'r')
+infile = netcdf_helpers.netcdf_file(inputFilename, 'r')
 
 print "loading in input array"
 inputVar = infile.variables[options.inputArrayName]
-outputArray = zeros(inputVar.shape, 'f')
+outputArray = np.zeros(inputVar.shape, 'f')
 if options.bigFile:
- offset = 0
- step = options.maxArraySize
- while offset < inputVar.shape[0]:
-  max = min (offset+step, inputVar.shape[0])
-  outputArray[offset:max] = inputVar[offset:max]
-  offset += step
+    offset = 0
+    step = options.maxArraySize
+    while offset < inputVar.shape[0]:
+        max = min(offset + step, inputVar.shape[0])
+        outputArray[offset:max] = inputVar[offset:max]
+        offset += step
 else:
- outputArray = inputVar.getValue()
-
-if options.stdMeanFilename <> "":
- print "reading std deviations and means from",options.stdMeanFilename
- stdMeanFile = netcdf_helpers.NetCDFFile(options.stdMeanFilename, 'r')
- inputStds = array(stdMeanFile.variables[options.inputArrayName+'Stds'].getValue())
- inputMeans = array(stdMeanFile.variables[options.inputArrayName+'Means'].getValue())
+    outputArray = np.copy(inputVar[:])
+    outputArray.flags.writeable = True
+if options.stdMeanFilename != "":
+    print "reading std deviations and means from", options.stdMeanFilename
+    stdMeanFile = netcdf_helpers.netcdf_file(options.stdMeanFilename, 'r')
+    inputStds = np.array(stdMeanFile.variables[options.inputArrayName + 'Stds'][:])
+    inputMeans = np.array(stdMeanFile.variables[options.inputArrayName + 'Means'][:])
 else:
- print "calculating std deviations"
- inputStds=Std(outputArray[:options.maxArraySize],0)
- print "calculating means"
- inputMeans=mean(outputArray[:options.maxArraySize],0)
+    print "calculating std deviations"
+    inputStds = std(outputArray[:options.maxArraySize], 0)
+    print "calculating means"
+    inputMeans = np.mean(outputArray[:options.maxArraySize], 0)
+
 if options.booleanColomn > 0:
- print "dont normalize boolean colomn", options.booleanColomn
- inputStds[options.booleanColomn] = 1
- inputMeans[options.booleanColomn] = 0
+    print "dont normalize boolean column", options.booleanColomn
+    inputStds[options.booleanColomn] = 1
+    inputMeans[options.booleanColomn] = 0
 
 print inputStds
 print inputMeans
 
 for p in range(len(inputStds)):
- if (inputStds[p]>0):
-  if options.bigFile:
-   offset = 0
-   step = options.maxArraySize
-   while offset < len(outputArray):
-    max = min (offset+step, len(outputArray))
-    outputArray[offset:max,p] = (outputArray[offset:max,p] - inputMeans[p])/inputStds[p]
-    offset += step
-  else:
-   outputArray[:,p]=(outputArray[:,p]-inputMeans[p])/inputStds[p]
+    if inputStds[p] > 0:
+        if options.bigFile:
+            offset = 0
+            step = options.maxArraySize
+            while offset < len(outputArray):
+                max = min(offset + step, len(outputArray))
+                outputArray[offset:max, p] = (outputArray[offset:max, p] - inputMeans[p]) / inputStds[p]
+                offset += step
+        else:
+            outputArray[:, p] = (outputArray[:, p] - inputMeans[p]) / inputStds[p]
 
-outfile = netcdf_helpers.NetCDFFile(outputFilename, 'w')
+outfile = netcdf_helpers.netcdf_file(outputFilename, 'w')
 for d in inputVar.dimensions:
- netcdf_helpers.createNcDim(outfile,d,infile.dimensions[d])
+    netcdf_helpers.createNcDim(outfile, d, infile.dimensions[d])
 if options.stdMeanFilename == "":
- netcdf_helpers.createNcVar(outfile,options.outputArrayName+'Means',inputMeans,'f',(inputVar.dimensions[1],),'input means')
- netcdf_helpers.createNcVar(outfile,options.outputArrayName+'Stds',inputStds,'f',(inputVar.dimensions[1],),'input std deviations')
-netcdf_helpers.createNcVar(outfile,options.outputArrayName,outputArray,'f',inputVar.dimensions,options.inputArrayName+' adjusted for mean 0 and std dev 1')
+    netcdf_helpers.createNcVar(outfile, options.outputArrayName + 'Means', inputMeans, 'f', (inputVar.dimensions[1],),
+                               'input means')
+    netcdf_helpers.createNcVar(outfile, options.outputArrayName + 'Stds', inputStds, 'f', (inputVar.dimensions[1],),
+                               'input std deviations')
+netcdf_helpers.createNcVar(outfile, options.outputArrayName, outputArray, 'f', inputVar.dimensions,
+                           options.inputArrayName + ' adjusted for mean 0 and std dev 1')
+
+infile.close()
 outfile.close()

From b12f47ef0b229ce50753c031684e984b28ed0a69 Mon Sep 17 00:00:00 2001
From: hendraet
Date: Mon, 10 Feb 2020 11:53:44 +0100
Subject: [PATCH 2/4] Refactoring (Fixing memleak, restructuring code, PEP8)

---
 examples/online_prediction/env.sh           |   0
 examples/online_prediction/online_delta.py  | 267 ++++++++++++--------
 examples/online_prediction/requirements.txt |   1 -
 utils/netcdf_helpers.py                     |  38 ++-
 utils/normalise_netcdf.py                   |  14 +-
 5 files changed, 183 insertions(+), 137 deletions(-)
 mode change 100644 => 100755 examples/online_prediction/env.sh

diff --git a/examples/online_prediction/env.sh b/examples/online_prediction/env.sh
old mode 100644
new mode 100755
diff --git a/examples/online_prediction/online_delta.py b/examples/online_prediction/online_delta.py
index 1aad2f6..1c2ac07 100755
--- a/examples/online_prediction/online_delta.py
+++ b/examples/online_prediction/online_delta.py
@@ -1,13 +1,13 @@
 #!/usr/bin/env python
 
-import netcdf_helpers
-from scipy import *
-from optparse import OptionParser
-import sys
-import os
 import re
+import os
+from optparse import OptionParser
 from xml.dom.minidom import parse
 
+import netcdf_helpers
+from scipy import *
+
 
 def Std(array, axis):
     if shape(array)[axis] > 1:
@@ -15,106 +15,161 @@ def Std(array, axis):
     return array
 
 
-def GetTargetString(strokeFileName):
-    asciiFileName = re.sub('lineStrokes', 'ascii', strokeFileName)
-    asciiFileName = re.sub('-[0-9]+\.xml', '.txt', asciiFileName)
+def get_target_string(stroke_file_name):
+    ascii_file_name = re.sub('lineStrokes', 'ascii', stroke_file_name)
+    ascii_file_name = re.sub('-[0-9]+\.xml', '.txt', ascii_file_name)
     try:
-        lineNr = int(re.search('-([0-9]+)\.xml', strokeFileName).group(1))
-        lines = [line.strip() for line in open(asciiFileName)]
-        return lines[lineNr + lines.index('CSR:') + 1]
-    except (AttributeError, IndexError) as e:
+        line_nr = int(re.search('-([0-9]+)\.xml', stroke_file_name).group(1))
+        with open(ascii_file_name, 'r') as ascii_file:
+            lines = [line.strip() for line in ascii_file]
+        return lines[line_nr + lines.index('CSR:') + 1]
+    except (AttributeError, IndexError):
         raise SystemExit
-        return ' '
-
-
-# command line options
-parser = OptionParser()
-
-# parse command line options
-(options, args) = parser.parse_args()
-if (len(args) < 2):
-    print "usage: -options input_filename output_filename"
-    print options
-    sys.exit(2)
-
-inputFilename = args[0]
-ncFilename = args[1]
-print options
-print "input filename", inputFilename
-print "data filename", ncFilename
-seqDims = []
-seqLengths = []
-targetStrings = []
-wordTargetStrings = []
-seqTags = []
-inputs = []
-predictions = []
-predSeqLengths = []
-targetSeqDims = []
-print "reading data files"
-for l in file(inputFilename).readlines():
-    inkmlfile = l.strip()
-    if len(inkmlfile):
-        seqTags.append(inkmlfile)
-        wordTargetStrings.append(' ')
-        seqTxt = GetTargetString(inkmlfile)
-        targetStrings.append(seqTxt)
-        oldlen = len(inputs)
-        oldlenPred = len(predictions)
-        firstCoord = array([])
-        for trace in parse(inkmlfile).getElementsByTagName('Stroke'):
-            for coords in trace.getElementsByTagName('Point'):
-                pt = array([float(coords.getAttribute('x').strip()), float(coords.getAttribute('y').strip())])
-                last = array([float(pt[0]), float(pt[1]), 0.0])
-                if len(firstCoord) == 0: firstCoord = last
-                last = last - firstCoord
-                inputs.append(last)
-                inputs[-1][-1] = 1
-        predictions.extend(inputs[oldlen + 1:])
-        predictions.append([float(0.0), float(0.0), float(0.0)])
-        seqLengths.append(len(inputs) - oldlen)
-        predSeqLengths.append(len(predictions) - oldlenPred)
-        seqDims.append([seqLengths[-1]])
-        targetSeqDims.append([predSeqLengths[-1]])
-
-firstIx = 0
-for i in range(len(seqLengths)):
-    for k in reversed(range(seqLengths[i])):
-        if k > 0:
-            inputs[firstIx + k] = array(inputs[firstIx + k]) - array(inputs[firstIx + k - 1])
-            inputs[firstIx + k][-1] = abs(inputs[firstIx + k][-1])
-            predictions[firstIx + k - 1] = inputs[firstIx + k]
-        if k == 0:
-            predictions[firstIx] = inputs[firstIx + 1]
-            inputs[firstIx] = array([0, 0, 0])
-    firstIx += seqLengths[i]
-
-# create a new .nc file
-print ("open file %s", ncFilename)
-nc_file = netcdf_helpers.netcdf_file(ncFilename, 'w')
-
-# create the dimensions
-netcdf_helpers.createNcDim(nc_file, 'numSeqs', len(seqLengths))
-netcdf_helpers.createNcDim(nc_file, 'numTimesteps', len(inputs))
-netcdf_helpers.createNcDim(nc_file, 'predNumTimesteps', len(predictions))
-netcdf_helpers.createNcDim(nc_file, 'inputPattSize', len(inputs[0]))
-netcdf_helpers.createNcDim(nc_file, 'numDims', 1)
-
-# create the variables
-netcdf_helpers.createNcStrings(nc_file, 'seqTags', seqTags, ('numSeqs', 'maxSeqTagLength'), 'sequence tags')
-netcdf_helpers.createNcStrings(nc_file, 'targetStrings', targetStrings, ('numSeqs', 'maxTargStringLength'),
-                               'target strings')
-netcdf_helpers.createNcStrings(nc_file, 'wordTargetStrings', wordTargetStrings, ('numSeqs', 'maxWordTargStringLength'),
-                               'word target strings')
-netcdf_helpers.createNcVar(nc_file, 'seqLengths', seqLengths, 'i', ('numSeqs',), 'sequence lengths')
-netcdf_helpers.createNcVar(nc_file, 'seqDims', seqDims, 'i', ('numSeqs', 'numDims'), 'sequence dimensions')
-netcdf_helpers.createNcVar(nc_file, 'inputs', inputs, 'f', ('numTimesteps', 'inputPattSize'), 'input patterns')
-netcdf_helpers.createNcVar(nc_file, 'predSeqLengths', predSeqLengths, 'i', ('numSeqs',), 'pred sequence lengths')
-netcdf_helpers.createNcVar(nc_file, 'targetSeqDims', targetSeqDims, 'i', ('numSeqs', 'numDims'),
-                           'pred sequence dimensions')
-netcdf_helpers.createNcVar(nc_file, 'targetPatterns', predictions, 'f', ('predNumTimesteps', 'inputPattSize'),
-                           'prediction patterns')
-
-# write the data to disk
-print "closing file", ncFilename
-nc_file.close()
+
+
+def write_to_file(filename, inputs, pred_seq_lengths, predictions, seq_dims, seq_lengths, seq_tags, target_seq_dims,
+                  target_strings, word_target_strings):
+
+    out_file = netcdf_helpers.netcdf_file(filename, 'w')
+
+    # create the dimensions
+    netcdf_helpers.create_nc_dim(out_file, 'numSeqs', len(seq_lengths))
+    netcdf_helpers.create_nc_dim(out_file, 'numTimesteps', len(inputs))
+    netcdf_helpers.create_nc_dim(out_file, 'predNumTimesteps', len(predictions))
+    netcdf_helpers.create_nc_dim(out_file, 'inputPattSize', len(inputs[0]))
+    netcdf_helpers.create_nc_dim(out_file, 'numDims', 1)
+
+    # create the variables
+    netcdf_helpers.create_nc_strings(out_file, 'seqTags', seq_tags, ('numSeqs', 'maxSeqTagLength'), 'sequence tags')
+    netcdf_helpers.create_nc_strings(out_file, 'targetStrings', target_strings, ('numSeqs', 'maxTargStringLength'),
+                                     'target strings')
+    netcdf_helpers.create_nc_strings(out_file, 'wordTargetStrings', word_target_strings,
+                                     ('numSeqs', 'maxWordTargStringLength'),
+                                     'word target strings')
+    netcdf_helpers.create_nc_var(out_file, 'seqLengths', seq_lengths, 'i', ('numSeqs',), 'sequence lengths')
+    netcdf_helpers.create_nc_var(out_file, 'seqDims', seq_dims, 'i', ('numSeqs', 'numDims'), 'sequence dimensions')
+    netcdf_helpers.create_nc_var(out_file, 'inputs', inputs, 'f', ('numTimesteps', 'inputPattSize'), 'input patterns')
+    netcdf_helpers.create_nc_var(out_file, 'predSeqLengths', pred_seq_lengths, 'i', ('numSeqs',),
+                                 'pred sequence lengths')
+    netcdf_helpers.create_nc_var(out_file, 'targetSeqDims', target_seq_dims, 'i', ('numSeqs', 'numDims'),
+                                 'pred sequence dimensions')
+    netcdf_helpers.create_nc_var(out_file, 'targetPatterns', predictions, 'f', ('predNumTimesteps', 'inputPattSize'),
+                                 'prediction patterns')
+    out_file.close()
+
+
+def process_lines(lines, tmp_filename):
+    seq_dims = []
+    seq_lengths = []
+    target_strings = []
+    word_target_strings = []
+    seq_tags = []
+    inputs = []
+    predictions = []
+    pred_seq_lengths = []
+    target_seq_dims = []
+
+    for line in lines:
+        inkmlfile = line.strip()
+        if len(inkmlfile):
+            seq_tags.append(inkmlfile)
+            word_target_strings.append(' ')
+            seq_txt = get_target_string(inkmlfile)
+            target_strings.append(seq_txt)
+            old_len = len(inputs)
+            old_len_pred = len(predictions)
+            first_coord = array([])
+            for trace in parse(inkmlfile).getElementsByTagName('Stroke'):
+                for coords in trace.getElementsByTagName('Point'):
+                    pt = array([float(coords.getAttribute('x').strip()), float(coords.getAttribute('y').strip())])
+                    last = array([float(pt[0]), float(pt[1]), 0.0])
+                    if len(first_coord) == 0:
+                        first_coord = last
+                    last = last - first_coord
+                    inputs.append(last)
+                    inputs[-1][-1] = 1
+            predictions.extend(inputs[old_len + 1:])
+            predictions.append([float(0.0), float(0.0), float(0.0)])
+            seq_lengths.append(len(inputs) - old_len)
+            pred_seq_lengths.append(len(predictions) - old_len_pred)
+            seq_dims.append([seq_lengths[-1]])
+            target_seq_dims.append([pred_seq_lengths[-1]])
+
+    write_to_file(tmp_filename, inputs, pred_seq_lengths, predictions, seq_dims, seq_lengths, seq_tags, target_seq_dims,
+                  target_strings, word_target_strings)
+
+
+def merge_tmp_files(tmp_files, nc_filename):
+    print("Merging...")
+
+    seq_dims = []
+    seq_lengths = []
+    target_strings = []
+    word_target_strings = []
+    seq_tags = []
+    inputs = []
+    target_patterns = []
+    pred_seq_lengths = []
+    target_seq_dims = []
+
+    for tmp_filename in tmp_files:
+        tmp_file = netcdf_helpers.netcdf_file(tmp_filename, 'r')
+
+        seq_dims.extend(tmp_file.variables['seqDims'][:])
+        seq_lengths.extend(tmp_file.variables['seqLengths'][:])
+        target_strings.extend(tmp_file.variables['targetStrings'][:])
+        word_target_strings.extend(tmp_file.variables['wordTargetStrings'][:])
+        seq_tags.extend(tmp_file.variables['seqTags'][:])
+        inputs.extend(tmp_file.variables['inputs'][:])
+        target_patterns.extend(tmp_file.variables['targetPatterns'][:])
+        pred_seq_lengths.extend(tmp_file.variables['predSeqLengths'][:])
+        target_seq_dims.extend(tmp_file.variables['targetSeqDims'][:])
+
+        tmp_file.close()
+        os.remove(tmp_filename)
+
+    first_ix = 0
+    for i in range(len(seq_lengths)):
+        for k in reversed(range(seq_lengths[i])):
+            if k > 0:
+                inputs[first_ix + k] = array(inputs[first_ix + k]) - array(inputs[first_ix + k - 1])
+                inputs[first_ix + k][-1] = abs(inputs[first_ix + k][-1])
+                target_patterns[first_ix + k - 1] = inputs[first_ix + k]
+            if k == 0:
+                target_patterns[first_ix] = inputs[first_ix + 1]
+                inputs[first_ix] = array([0, 0, 0])
+        first_ix += seq_lengths[i]
+
+    write_to_file(nc_filename, inputs, pred_seq_lengths, target_patterns, seq_dims, seq_lengths, seq_tags,
+                  target_seq_dims, target_strings, word_target_strings)
+
+
+def main():
+    # command line options
+    parser = OptionParser()
+
+    # parse command line options
+    (options, args) = parser.parse_args()
+    if len(args) < 2:
+        print "usage: -options input_filename output_filename"
+        print options
+        sys.exit(2)
+
+    input_filename = args[0]
+    nc_filename = args[1]
+
+    tmp_files = []
+    all_lines = file(input_filename).readlines()
+    batch_size = 500
+    for i in range(0, len(all_lines), batch_size):
+        print("Batch", str(i + 1), "Batch size:", str(batch_size))
+        tmp_filename = 'tmp_' + str(int(i / batch_size)) + '.nc'
+        tmp_files.append(tmp_filename)
+        line_batch = all_lines[i:i + batch_size]
+        process_lines(line_batch, tmp_filename)
+
+    merge_tmp_files(tmp_files, nc_filename)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/online_prediction/requirements.txt b/examples/online_prediction/requirements.txt
index 40127e6..058dd84 100644
--- a/examples/online_prediction/requirements.txt
+++ b/examples/online_prediction/requirements.txt
@@ -2,5 +2,4 @@ cftime==1.0.4.2
 netCDF4==1.5.3
 numpy==1.16.6
 Pillow==6.2.2
-ScientificPython==2.9.4
 scipy==1.2.3
diff --git a/utils/netcdf_helpers.py b/utils/netcdf_helpers.py
index 4e320d1..4272510 100755
--- a/utils/netcdf_helpers.py
+++ b/utils/netcdf_helpers.py
@@ -15,19 +15,18 @@
 # You should have received a copy of the GNU General Public License
 # along with RNNLIB. If not, see <http://www.gnu.org/licenses/>.
 
-# from Scientific.IO.NetCDF import NetCDFFile
 import netCDF4
-from scipy.io.netcdf import netcdf_file
 import numpy as np
+from scipy.io.netcdf import netcdf_file
 
 
-def createNcDim(ncfile, name, d):
+def create_nc_dim(ncfile, name, d):
     print "creating netcdf dimension:", name, d
     ncfile.createDimension(name, d)
 
 
 # assumes ncfile will be written over (opened with 'w')
-def createNcVar(ncfile, vname, data, vtype, dims, desc):
+def create_nc_var(ncfile, vname, data, vtype, dims, desc):
     print "creating netcdf variable", vname
     nc_var = ncfile.createVariable(vname, vtype, dims)
     nc_var.longname = desc
@@ -35,31 +34,24 @@ def createNcVar(ncfile, vname, data, vtype, dims, desc):
         np_data = data
     else:
         np_data = np.asarray(data, dtype=nc_var.typecode() + str(nc_var.itemsize()))
-    assert nc_var.shape == np_data.shape  # TODO:remove
     nc_var[:] = np_data
-    print nc_var.shape
 
 
-def maxLen(strings):
-    maxLength = 0
+def max_len(strings):
+    max_length = 0
     for s in strings:
         length = len(s)
-        if (length > maxLength):
-            maxLength = length
-    return maxLength
-
+        if length > max_length:
+            max_length = length
+    return max_length
 
-def createNcStrings(ncfile, vname, strings, dims, desc):
-    str_length = maxLen(strings)
 
-    chars = np.empty((len(strings), str_length), dtype='S' + str(str_length))
-    [np.append(chars, netCDF4.stringtoarr(string, str_length, 'S')) for string in strings]
+def create_nc_strings(ncfile, vname, strings, dims, desc):
+    str_length = max_len(strings)
 
-    createNcDim(ncfile, dims[1], str_length)
-    createNcVar(ncfile, vname, np.array(chars), 'S1', dims, desc)
+    chars = np.zeros((len(strings), str_length), dtype='S1')
+    for i, string in enumerate(strings):
+        chars[i] = netCDF4.stringtoarr(string, str_length, 'S')
 
-# def createNcString(ncfile,vname,string,dims,desc):
-#  print "writing string",vname
-#  nullString = string + '\0'
-#  createNcDim(ncfile,dims[0],len(nullString))
-#  createNcVar(ncfile,vname,nullString,'c',dims,desc)
+    create_nc_dim(ncfile, dims[1], str_length)
+    create_nc_var(ncfile, vname, np.array(chars), 'S1', dims, desc)
diff --git a/utils/normalise_netcdf.py b/utils/normalise_netcdf.py
index 0fd1e84..11fd5f9 100755
--- a/utils/normalise_netcdf.py
+++ b/utils/normalise_netcdf.py
@@ -100,15 +100,15 @@ def std(array, axis):
 
 outfile = netcdf_helpers.netcdf_file(outputFilename, 'w')
 for d in inputVar.dimensions:
-    netcdf_helpers.createNcDim(outfile, d, infile.dimensions[d])
+    netcdf_helpers.create_nc_dim(outfile, d, infile.dimensions[d])
 if options.stdMeanFilename == "":
-    netcdf_helpers.createNcVar(outfile, options.outputArrayName + 'Means', inputMeans, 'f', (inputVar.dimensions[1],),
-                               'input means')
-    netcdf_helpers.createNcVar(outfile, options.outputArrayName + 'Stds', inputStds, 'f', (inputVar.dimensions[1],),
-                               'input std deviations')
-netcdf_helpers.createNcVar(outfile, options.outputArrayName, outputArray, 'f', inputVar.dimensions,
-                           options.inputArrayName + ' adjusted for mean 0 and std dev 1')
+    netcdf_helpers.create_nc_var(outfile, options.outputArrayName + 'Means', inputMeans, 'f', (inputVar.dimensions[1],),
+                                 'input means')
+    netcdf_helpers.create_nc_var(outfile, options.outputArrayName + 'Stds', inputStds, 'f', (inputVar.dimensions[1],),
+                                 'input std deviations')
+netcdf_helpers.create_nc_var(outfile, options.outputArrayName, outputArray, 'f', inputVar.dimensions,
+                             options.inputArrayName + ' adjusted for mean 0 and std dev 1')
 
 infile.close()
 outfile.close()

From b53d4fd7ace0a923b821f30e03926351b69264d7 Mon Sep 17 00:00:00 2001
From: hendraet
Date: Mon, 10 Feb 2020 11:59:22 +0100
Subject: [PATCH 3/4] Updates README

---
 README.md | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index 106c2d6..7d34d53 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,9 @@
 # Origin
 The original RNNLIB is hosted at http://sourceforge.net/projects/rnnl
-while this "fork" is created to repeat results for the 
-online handwriting prediction and synthesis reported in 
-http://arxiv.org/abs/1308.0850. The later by now is Alex Graves's classic 
+while this "fork" is created to repeat results for the
+online handwriting prediction and synthesis reported in
+http://arxiv.org/abs/1308.0850. The latter by now is Alex Graves's classic
 paper on LSTM networks showing of what RNN can learn about the structure
 present in the sequential input.
 
@@ -26,17 +26,18 @@ In addition, the following python packages are needed for the auxiliary scripts
 * SciPy
 * PyLab
 * PIL
+* netCDF4
 
-And this package is needed to create and manipulate netcdf data files with python, and to run the experiments in the 'examples' directory:
-
-* ScientificPython (NOT Scipy)
+The packages needed to generate the example datasets can be installed via the
+requirements file in the `examples/online_prediction` directory. Just execute
+`pip install -r requirements.txt`
 
 To build RNNLIB do
 
     $ cmake -DCMAKE_BUILD_TYPE=Release .
    $ cmake --build .
 
-Cmake run creates the binary files 'rnnlib', 'rnnsynth' and 'gradient_check' in the current directory. 
+The cmake run creates the binary files 'rnnlib', 'rnnsynth' and 'gradient_check' in the current directory.
 It is recommended that you add the directory containing the 'rnnlib' binary to your path,
 as otherwise the tools in the 'utilities' directory will not work.
 
@@ -44,31 +45,31 @@ as otherwise the tools in the 'utilities' directory will not work.
 Project files for the integrated development environments can be generated by cmake.
 Run cmake --help to get list of supported IDEs.
- 
+
 # Handwriting synthesis
-Step in to examples/online_prediction and go through few steps below to prepare the 
+Step into examples/online_prediction and go through the steps below to prepare the
 training data, train the model and eventually plot the results of the synthesis
 
 ## Downloading online handwriting dataset
-Start by registering and downloading pen strokes data from 
+Start by registering and downloading the pen strokes data from
 http://www.iam.unibe.ch/~fkiwww/iamondb/data/lineStrokes-all.tar.gz
 Text lables for strokes can be found here
 http://www.iam.unibe.ch/~fkiwww/iamondb/data/ascii-all.tar.gz
 Then unzip ./lineStrokes and ./ascii under examples/online_prediction.
-Data format in the downloaded files can not be used as is 
+The data format in the downloaded files cannot be used as is
 and requires further preprocessing to convert pen coordinates
 to offsets from previous point and merge them into the single
 file of netcdf format.
 
 ## Preparing the training data
-Run ./build_netcdf.sh to split dataset to training and validation sets. 
+Run ./build_netcdf.sh to split the dataset into training and validation sets.
 The same script does all necessary preprocessing including normalisation
-of the input and makes corresponding online.nc and online_validation.nc 
+of the input and makes corresponding online.nc and online_validation.nc
 files for use with rnnlib .
 
-Each point in the input sequences from online.nc consists of three numbers: 
+Each point in the input sequences from online.nc consists of three numbers:
 the x and y offset from the previous point, and the binary end-of-stroke feature.
 
 ## Gradient check
@@ -101,7 +102,7 @@
 The best solution found is stored in synth1d@