Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 52 additions & 39 deletions readtofasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,63 +5,76 @@
import sys
import os

def readevents(filename):
events = []
kmers = []
sds = []
f = h5py.File(filename,"r")
def readfasta(filename, calls="2D"):
"""
Reads the fastq from basecalled read, returns label and sequence
"""
with h5py.File(filename, "r") as f:
calls_group = "BaseCalled_"+calls
eventpath = "/Analyses/Basecall_2D_000/"+calls_group
if not eventpath in f:
eventpath = "/Analyses/Basecall_1D_000/"+calls_group

calls = "BaseCalled_2D"
if not "Analyses" in f or not "Basecall_2D_000" in f["Analyses"] or not calls in f["Analyses"]["Basecall_2D_000"]:
raise KeyError("Not present in HDF5 file")
if not "Analyses" in f or not "Basecall_2D_000" in f["Analyses"] or not calls_group in f["Analyses"]["Basecall_2D_000"]:
raise KeyError("Not present in HDF5 file")
if not "Fastq" in f[eventpath]:
raise KeyError("Fastq not found in HDF5 file")
data = f["Analyses"]["Basecall_2D_000"][calls_group]["Fastq"]

if not "Fastq" in f["Analyses"]["Basecall_2D_000"][calls]:
raise KeyError("Fastq not found in HDF5 file")
data = f["Analyses"]["Basecall_2D_000"][calls]["Fastq"]
lines = data[()].splitlines()
seq = lines[1].decode('utf-8')
label = lines[0][1:].strip()
labelSuffix = "_twodirections"
label = label+labelSuffix
label = label.decode('utf-8')
lines = data[()].splitlines()

seq = lines[1].decode('utf-8')

label = lines[0][1:].strip()
label_suffix = "_twodirections" if calls == "2D" else calls
label = label+label_suffix
label = label.decode('utf-8')

return label, seq

def build_filelist(filenames):
""" If given a directory, descend into directory and generate
list of files (but do not descend into subdirectories)."""
file_list = []
for filename in filenames:
if os.path.isfile(filename):
file_list.append(filename)
elif os.path.isdir(filename):
for f in os.listdir(filename):
pathname = os.path.join(filename, f)
if os.path.isfile(pathname) and pathname.endswith('.fast5'):
file_list.append(pathname)
return file_list

if __name__ == "__main__":
parser = argparse.ArgumentParser()

parser.add_argument('infile', type=str)
parser.add_argument('-o','--output', type=argparse.FileType('w'), default=sys.stdout, help="Specify output file (default:stdout)")
parser.add_argument('-l','--linelength', type=int, default=60)
parser.add_argument('infile', nargs="+", type=str)
parser.add_argument('-l', '--linelength', type=int, default=60)
parser.add_argument('-o', '--output', type=argparse.FileType('w'),
default=sys.stdout,
help="Specify output file (default:stdout)")

args = parser.parse_args()

try:
if os.path.isfile(args.infile):
label, sequence = readevents(args.infile)
label += " "+arg.infile
seqs = [(label, seq)]
elif os.path.isdir(args.infile):
seqs = []
for f in os.listdir(args.infile):
pathname = os.path.join(args.infile, f)
if os.path.isfile(pathname) and pathname.endswith('.fast5'):
label, sequence = readevents(pathname)
seqs.append((label+" "+pathname, sequence))
else:
raise ValueError("Path is not a directory or file: "+args.infile)
except KeyError:
sys.exit(1)
files = build_filelist(args.infile)
seqs = []
for infile in files:
try:
label, sequence = readfasta(infile)
seqs.append((label+" "+infile, sequence))
except:
continue

outf = args.output
infname = args.infile
llen = args.linelength

for label, sequence in seqs:
print(">"+label, file=outf)
for i in range( len(sequence)//llen ):
nchunks = (len(sequence)+llen-1)//llen
for i in range(nchunks):
lstart = i*llen
lend = min(lstart + llen, len(sequence))
print( sequence[lstart:lend], file=outf )
print(sequence[lstart:lend], file=outf)

sys.exit(0)