From 97f4fde6f4c8808960c748b71cf35393e3d08813 Mon Sep 17 00:00:00 2001 From: Jonathan Dursi Date: Thu, 5 Nov 2015 17:21:56 -0500 Subject: [PATCH 1/2] Modify readtofasta to work with both versions of fast5 files --- readtofasta.py | 91 ++++++++++++++++++++++++++++---------------------- 1 file changed, 52 insertions(+), 39 deletions(-) diff --git a/readtofasta.py b/readtofasta.py index ab09c5a..dcdf8b0 100755 --- a/readtofasta.py +++ b/readtofasta.py @@ -5,63 +5,76 @@ import sys import os -def readevents(filename): - events = [] - kmers = [] - sds = [] - f = h5py.File(filename,"r") +def readfasta(filename, calls="2D"): + """ + Reads the fastq from basecalled read, returns label and sequence + """ + with h5py.File(filename, "r") as f: + calls_group = "BaseCalled_"+calls + eventpath = "/Analyses/Basecall_2D_000/"+calls_group + if not eventpath in f: + eventpath = "/Analyses/Basecall_1D_000/"+calls_group - calls = "BaseCalled_2D" - if not "Analyses" in f or not "Basecall_2D_000" in f["Analyses"] or not calls in f["Analyses"]["Basecall_2D_000"]: - raise KeyError("Not present in HDF5 file") + if not "Analyses" in f or not "Basecall_2D_000" in f["Analyses"] or not calls_group in f["Analyses"]["Basecall_2D_000"]: + raise KeyError("Not present in HDF5 file") + if not "Fastq" in f[eventpath]: + raise KeyError("Fastq not found in HDF5 file") + data = f["Analyses"]["Basecall_2D_000"][calls_group]["Fastq"] - if not "Fastq" in f["Analyses"]["Basecall_2D_000"][calls]: - raise KeyError("Fastq not found in HDF5 file") - data = f["Analyses"]["Basecall_2D_000"][calls]["Fastq"] - lines = data[()].splitlines() - seq = lines[1].decode('utf-8') - label = lines[0][1:].strip() - labelSuffix = "_twodirections" - label = label+labelSuffix - label = label.decode('utf-8') + lines = data[()].splitlines() + + seq = lines[1].decode('utf-8') + + label = lines[0][1:].strip() + label_suffix = "_twodirections" if calls == "2D" else calls + label = label+label_suffix + label = label.decode('utf-8') return label, seq +def build_filelist(filenames): + """ If given a directory, descend into directory and generate + list of files (but do not descend into subdirectories).""" + file_list = [] + for filename in filenames: + if os.path.isfile(filename): + file_list.append(filename) + elif os.path.isdir(filename): + for f in os.listdir(filename): + pathname = os.path.join(filename, f) + if os.path.isfile(pathname) and pathname.endswith('.fast5'): + file_list.append(pathname) + return file_list + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('infile', type=str) - parser.add_argument('-o','--output', type=argparse.FileType('w'), default=sys.stdout, help="Specify output file (default:stdout)") - parser.add_argument('-l','--linelength', type=int, default=60) + parser.add_argument('infile', nargs="+", type=str) + parser.add_argument('-l', '--linelength', type=int, default=60) + parser.add_argument('-o', '--output', type=argparse.FileType('w'), + default=sys.stdout, + help="Specify output file (default:stdout)") args = parser.parse_args() - try: - if os.path.isfile(args.infile): - label, sequence = readevents(args.infile) - label += " "+arg.infile - seqs = [(label, seq)] - elif os.path.isdir(args.infile): - seqs = [] - for f in os.listdir(args.infile): - pathname = os.path.join(args.infile, f) - if os.path.isfile(pathname) and pathname.endswith('.fast5'): - label, sequence = readevents(pathname) - seqs.append((label+" "+pathname, sequence)) - else: - raise ValueError("Path is not a directory or file: "+args.infile) - except KeyError: - sys.exit(1) + files = build_filelist(args.infile) + seqs = [] + for infile in files: + try: + label, sequence = readfasta(infile) + seqs.append((label+" "+infile, sequence)) + except: + continue outf = args.output - infname = args.infile llen = args.linelength for label, sequence in seqs: print(">"+label, file=outf) - for i in range( len(sequence)//llen ): + nchunks = (len(sequence)+llen-1)/llen + for i in range(nchunks): lstart = i*llen lend = min(lstart + llen, len(sequence)) - print( sequence[lstart:lend], file=outf ) + print(sequence[lstart:lend], file=outf) sys.exit(0) From 52661fbefd905c4dd0a04aa6a7cfb8aaf16ba8bb Mon Sep 17 00:00:00 2001 From: Jonathan Dursi Date: Thu, 5 Nov 2015 17:24:10 -0500 Subject: [PATCH 2/2] Explicitly integer division --- readtofasta.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/readtofasta.py b/readtofasta.py index dcdf8b0..b1cc2b4 100755 --- a/readtofasta.py +++ b/readtofasta.py @@ -71,7 +71,7 @@ def build_filelist(filenames): for label, sequence in seqs: print(">"+label, file=outf) - nchunks = (len(sequence)+llen-1)/llen + nchunks = (len(sequence)+llen-1)//llen for i in range(nchunks): lstart = i*llen lend = min(lstart + llen, len(sequence))