diff --git a/readtofasta.py b/readtofasta.py index ab09c5a..b1cc2b4 100755 --- a/readtofasta.py +++ b/readtofasta.py @@ -5,63 +5,76 @@ import sys import os -def readevents(filename): - events = [] - kmers = [] - sds = [] - f = h5py.File(filename,"r") +def readfasta(filename, calls="2D"): + """ + Reads the fastq from basecalled read, returns label and sequence + """ + with h5py.File(filename, "r") as f: + calls_group = "BaseCalled_"+calls + eventpath = "/Analyses/Basecall_2D_000/"+calls_group + if not eventpath in f: + eventpath = "/Analyses/Basecall_1D_000/"+calls_group - calls = "BaseCalled_2D" - if not "Analyses" in f or not "Basecall_2D_000" in f["Analyses"] or not calls in f["Analyses"]["Basecall_2D_000"]: - raise KeyError("Not present in HDF5 file") + if not "Analyses" in f or not "Basecall_2D_000" in f["Analyses"] or not calls_group in f["Analyses"]["Basecall_2D_000"]: + raise KeyError("Not present in HDF5 file") + if not "Fastq" in f[eventpath]: + raise KeyError("Fastq not found in HDF5 file") + data = f["Analyses"]["Basecall_2D_000"][calls_group]["Fastq"] - if not "Fastq" in f["Analyses"]["Basecall_2D_000"][calls]: - raise KeyError("Fastq not found in HDF5 file") - data = f["Analyses"]["Basecall_2D_000"][calls]["Fastq"] - lines = data[()].splitlines() - seq = lines[1].decode('utf-8') - label = lines[0][1:].strip() - labelSuffix = "_twodirections" - label = label+labelSuffix - label = label.decode('utf-8') + lines = data[()].splitlines() + + seq = lines[1].decode('utf-8') + + label = lines[0][1:].strip() + label_suffix = "_twodirections" if calls == "2D" else calls + label = label+label_suffix + label = label.decode('utf-8') return label, seq +def build_filelist(filenames): + """ If given a directory, descend into directory and generate + list of files (but do not descend into subdirectories).""" + file_list = [] + for filename in filenames: + if os.path.isfile(filename): + file_list.append(filename) + elif os.path.isdir(filename): + for f in os.listdir(filename): + pathname = os.path.join(filename, f) + if os.path.isfile(pathname) and pathname.endswith('.fast5'): + file_list.append(pathname) + return file_list + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('infile', type=str) - parser.add_argument('-o','--output', type=argparse.FileType('w'), default=sys.stdout, help="Specify output file (default:stdout)") - parser.add_argument('-l','--linelength', type=int, default=60) + parser.add_argument('infile', nargs="+", type=str) + parser.add_argument('-l', '--linelength', type=int, default=60) + parser.add_argument('-o', '--output', type=argparse.FileType('w'), + default=sys.stdout, + help="Specify output file (default:stdout)") args = parser.parse_args() - try: - if os.path.isfile(args.infile): - label, sequence = readevents(args.infile) - label += " "+arg.infile - seqs = [(label, seq)] - elif os.path.isdir(args.infile): - seqs = [] - for f in os.listdir(args.infile): - pathname = os.path.join(args.infile, f) - if os.path.isfile(pathname) and pathname.endswith('.fast5'): - label, sequence = readevents(pathname) - seqs.append((label+" "+pathname, sequence)) - else: - raise ValueError("Path is not a directory or file: "+args.infile) - except KeyError: - sys.exit(1) + files = build_filelist(args.infile) + seqs = [] + for infile in files: + try: + label, sequence = readfasta(infile) + seqs.append((label+" "+infile, sequence)) + except: + continue outf = args.output - infname = args.infile llen = args.linelength for label, sequence in seqs: print(">"+label, file=outf) - for i in range( len(sequence)//llen ): + nchunks = (len(sequence)+llen-1)//llen + for i in range(nchunks): lstart = i*llen lend = min(lstart + llen, len(sequence)) - print( sequence[lstart:lend], file=outf ) + print(sequence[lstart:lend], file=outf) sys.exit(0)