From f85ac427038ea18678d564ae27bf9f5c9017bd19 Mon Sep 17 00:00:00 2001
From: meesters
Date: Wed, 6 Aug 2025 13:54:10 +0200
Subject: [PATCH 1/3] refactor: all scripts

---
 workflow/scripts/ResultTxt.py           |  23 +++--
 workflow/scripts/ZINCdownload.py        |   1 +
 workflow/scripts/concatPDBQT.py         |   1 +
 workflow/scripts/downloadZINCsubsets.py |   6 +-
 workflow/scripts/generateIRODS.py       |   6 +-
 workflow/scripts/makeHistogram.py       |  13 ++-
 workflow/scripts/mergeOutput.py         |  15 +--
 workflow/scripts/prepareDocking.py      |  11 ++-
 workflow/scripts/prepareReceptor.py     |  14 +--
 workflow/scripts/sortResult.py          |  66 +++++++------
 workflow/scripts/splitFile.py           |  10 +-
 workflow/scripts/union_venn.py          |  73 ++++++++------
 workflow/scripts/venn.py                | 124 +++++++++++++-----------
 13 files changed, 206 insertions(+), 157 deletions(-)

diff --git a/workflow/scripts/ResultTxt.py b/workflow/scripts/ResultTxt.py
index 2d30e7b..981b168 100644
--- a/workflow/scripts/ResultTxt.py
+++ b/workflow/scripts/ResultTxt.py
@@ -5,11 +5,12 @@
 import os
 import pandas as pd
 
-infile = snakemake.input[0] 
+infile = snakemake.input[0]
 outfile = snakemake.output[0]
 
 linesep = os.linesep
 
+
 def pairwise(iterable):
     """
     :param iterable:
@@ -19,28 +20,29 @@ def pairwise(iterable):
     next(b, None)
     return zip(a, b)
 
+
 def extract_pdbqt(infile):
     """
     :param infile: path to infile
     :param outfile: path to outfile
     :return: receptor name, IDs and enthalpies
     """
-    target_IDs = []
-    enthalpies = []
-    references = set()
-    with open(infile, encoding='utf-8') as to_parse:
+    target_IDs = []
+    enthalpies = []
+    references = set()
+    with open(infile, encoding="utf-8") as to_parse:
         # this line reads: 'REMARK RECEPTOR path/to/target.pdbqt'
         headerline = to_parse.readline()
         # we want to extract the target name WITHOUT the suffix
-        receptor = Path(headerline.split(' ')[-1]).stem
+        receptor = Path(headerline.split(" ")[-1]).stem
         # we then proceed extracting the enthalpy per ligand
         # we iterate two lines per iteration, because the file is structured like:
         #
         # REMARK VINA RESULT:    Enthalpy 1    Enthalpy 2    Enthalpy 3
         # REMARK  Name = 
         for i, j in pairwise(to_parse):
-            if 'Name' in j:
-                ID = j.split(' = ')[-1].strip(linesep)
+            if "Name" in j:
+                ID = j.split(" = ")[-1].strip(linesep)
                 if ID not in references:
                     references.add(ID)
                     target_IDs.append(ID)
@@ -48,9 +50,10 @@ def extract_pdbqt(infile):
 
     return receptor, target_IDs, enthalpies
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     receptor, target_IDs, enthalpies = extract_pdbqt(infile)
     # putting into data frame:
-    out_df = pd.DataFrame(enthalpies, index = target_IDs, columns = [receptor])
+    out_df = pd.DataFrame(enthalpies, index=target_IDs, columns=[receptor])
     # finally, save the output
     out_df.to_csv(outfile)
diff --git a/workflow/scripts/ZINCdownload.py b/workflow/scripts/ZINCdownload.py
index 4410ca0..455e420 100644
--- a/workflow/scripts/ZINCdownload.py
+++ b/workflow/scripts/ZINCdownload.py
@@ -1,4 +1,5 @@
 """download ligands in pdbqt format from ZINC database"""
+
 import os
 
 import requests
diff --git a/workflow/scripts/concatPDBQT.py b/workflow/scripts/concatPDBQT.py
index f7286de..6d32adc 100644
--- a/workflow/scripts/concatPDBQT.py
+++ b/workflow/scripts/concatPDBQT.py
@@ -1,4 +1,5 @@
 """read all files in input folder and concat them together"""
+
 import os
 
 from snakemake.shell import shell
diff --git a/workflow/scripts/downloadZINCsubsets.py b/workflow/scripts/downloadZINCsubsets.py
index 2b650ba..ac8e872 100644
--- a/workflow/scripts/downloadZINCsubsets.py
+++ b/workflow/scripts/downloadZINCsubsets.py
@@ -1,11 +1,13 @@
-'''download ligand subsets in mol2 format from ZINC database'''
+"""download ligand subsets in mol2 format from ZINC database"""
 
 import requests
 
 out = snakemake.output[0]
 subset = snakemake.params.sub
 
-url = ''.join(["https://zinc15.docking.org/substances/subsets/", subset, ".mol2?count=all"])
+url = "".join(
+    ["https://zinc15.docking.org/substances/subsets/", subset, ".mol2?count=all"]
+)
 
 r = requests.get(url, timeout=60)
 with open(out, "wb") as outfile:
diff --git a/workflow/scripts/generateIRODS.py b/workflow/scripts/generateIRODS.py
index 4161460..cbfc3ca 100755
--- a/workflow/scripts/generateIRODS.py
+++ b/workflow/scripts/generateIRODS.py
@@ -3,7 +3,7 @@
 import json
 
 import pandas as pd
 
-target1 = snakemake.config["TARGETS"] 
+target1 = snakemake.config["TARGETS"]
 targets = snakemake.config["RESCREENING_TARGETS"]
 name = snakemake.config["EXPERIMENT_NAME"]
 
@@ -37,5 +37,5 @@
 
 out_dict.update(results)
 
-with open(outfile, "w", encoding = "utf-8") as f:
-    json.dump(out_dict, f, sort_keys = False, indent = 4)
+with open(outfile, "w", encoding="utf-8") as f:
+    json.dump(out_dict, f, sort_keys=False, indent=4)
diff --git a/workflow/scripts/makeHistogram.py b/workflow/scripts/makeHistogram.py
index c8ef6ac..5ed5ffc 100644
--- a/workflow/scripts/makeHistogram.py
+++ b/workflow/scripts/makeHistogram.py
@@ -5,12 +5,14 @@
 import pandas as pd
 from matplotlib import pyplot as plt
 
+
 def pairwise(iterable):
     "s -> (s0,s1), (s1,s2), (s2, s3), ..."
-    a, b= itertools.tee(iterable)
+    a, b = itertools.tee(iterable)
     next(b, None)
     return zip(a, b)
 
+
 def getHistogram(in_pdbqt, low, high, step):
     """creates histogram of structures in_pdbqt.
 
@@ -25,15 +27,15 @@ def getHistogram(in_pdbqt, low, high, step):
     step : float
         size of binding energy buckets.
     """
-    f = gzip.open(in_pdbqt, 'r')
+    f = gzip.open(in_pdbqt, "r")
     lowerBound = low
     bins = []
-    list1 =[]
+    list1 = []
     while lowerBound < high:
         bins.append(lowerBound)
         lowerBound = lowerBound + step
     bins = [round(num, 2) for num in bins]
-    for a,b in pairwise(f):
+    for a, b in pairwise(f):
         if "REMARK VINA RESULT:" in str(b):
             if "MODEL 1" in str(a):
                 tmp = b.split()
@@ -46,6 +48,7 @@ def getHistogram(in_pdbqt, low, high, step):
     plt.ylabel("frequency")
     fig = plt.gcf()
     fig.set_size_inches(10, 5)
-    plt.savefig(snakemake.output[0], dpi = 300)
+    plt.savefig(snakemake.output[0], dpi=300)
+
 
 getHistogram(snakemake.input[0], -13, -3, 0.2)
diff --git a/workflow/scripts/mergeOutput.py b/workflow/scripts/mergeOutput.py
index be0d6ec..afdab83 100644
--- a/workflow/scripts/mergeOutput.py
+++ b/workflow/scripts/mergeOutput.py
@@ -1,18 +1,21 @@
 """merge all docking output files together"""
+
 import shutil
 import subprocess
 
 outFile = snakemake.output[0]
 inFiles = snakemake.input
 
-#TODO: change to ZIPFILE to support better result portability
-with open(outFile, 'wb') as outF:
+# TODO: change to ZIPFILE to support better result portability
+with open(outFile, "wb") as outF:
     for file in inFiles:
         try:
-            out = subprocess.check_output(['gunzip', '-t', file]) #test if gzip file is valid before merging
+            out = subprocess.check_output(
+                ["gunzip", "-t", file]
+            )  # test if gzip file is valid before merging
 
-            #TODO: avoid overwriting the file descriptor!
-            with open(file, 'rb') as sourceFile:
+            # TODO: avoid overwriting the file descriptor!
+            with open(file, "rb") as sourceFile:
                 shutil.copyfileobj(sourceFile, outF)
         except subprocess.CalledProcessError:
-            print(f'Erros occured during the docking of {file}')
+            print(f"Errors occurred during the docking of {file}")
diff --git a/workflow/scripts/prepareDocking.py b/workflow/scripts/prepareDocking.py
index d80f4dd..80ed987 100644
--- a/workflow/scripts/prepareDocking.py
+++ b/workflow/scripts/prepareDocking.py
@@ -2,6 +2,7 @@
 preparation of ligand input file for VinaLC
 containing the path to every ligand with same weight + logP
 """
+
 import os
 
 input_directory = snakemake.input.in_dir
@@ -15,7 +16,13 @@
             for purchase in snakemake.config["ZINC_INPUT"]["PURCHASE"]:
                 for ph in snakemake.config["ZINC_INPUT"]["PH"]:
                     for charge in snakemake.config["ZINC_INPUT"]["CHARGE"]:
-                        fname = weightlog + react + purchase + ph + charge + ".pdbqt"
+                        fname = (
+                            weightlog + react + purchase + ph + charge + ".pdbqt"
+                        )
                         ligand_file = os.path.join(input_directory, fname)
-                        with open(os.path.join(snakemake.output.library ),"r", encoding='utf-8') as file_object:
+                        with open(
+                            os.path.join(snakemake.output.library),
+                            "a",  # append mode; the file is written below, "r" cannot be written to
+                            encoding="utf-8",
+                        ) as file_object:
                             file_object.write(ligand_file + "\n")
diff --git a/workflow/scripts/prepareReceptor.py b/workflow/scripts/prepareReceptor.py
index 31bb4e0..239b81c 100644
--- a/workflow/scripts/prepareReceptor.py
+++ b/workflow/scripts/prepareReceptor.py
@@ -3,6 +3,7 @@
 import os
 from Bio.PDB import PDBParser, PDBIO
 
+
 def removeChains(model, chainlist):
     """
     removes chains not specified in chainlist from model
@@ -20,7 +21,7 @@
     chain_to_remove = []
     for chain in model:
         for residue in chain:
-            if residue.id[0] != ' ':
+            if residue.id[0] != " ":
                 residue_to_remove.append((chain.id, residue.id))
         if not chain:
             chain_to_remove.append(chain.id)
@@ -39,9 +40,9 @@
     select chains to delete depending on config definition
     """
     print(target)
-    ID = target.split(',')
+    ID = target.split(",")
     chains = ID[1].split(" ")
-    parser = PDBParser()#MMCIFParser()
+    parser = PDBParser()  # MMCIFParser()
     structure = parser.get_structure(ID[0], inputfile)
     model = structure[0]
     removeChains(model, chains)
@@ -52,12 +53,13 @@
     print("printing outfile")
     io.save(out)
 
-head,tail = os.path.split(snakemake.input[0])
+
+head, tail = os.path.split(snakemake.input[0])
 filename = tail.split(".")[0]
 
 if any(filename in target for target in snakemake.config["TARGETS"]):
-    print('filename in targets')
-    prepareRec(snakemake.input[0], snakemake.output[0], snakemake.config["TARGETS"][0])
+    print("filename in targets")
+    prepareRec(snakemake.input[0], snakemake.output[0], snakemake.config["TARGETS"][0])
 
 elif filename in str(snakemake.config["RESCREENING_TARGETS"]):
     for target in snakemake.config["RESCREENING_TARGETS"]:
diff --git a/workflow/scripts/sortResult.py b/workflow/scripts/sortResult.py
index bd6afe2..6506e16 100644
--- a/workflow/scripts/sortResult.py
+++ b/workflow/scripts/sortResult.py
@@ -1,19 +1,21 @@
 """sorting and outputting the N>1 best results from a docking run"""
+
 import subprocess
 import gzip
 import os
 from math import ceil
 
+
 inPath = snakemake.input[0]
 outFile = snakemake.output[0]
 num = float(snakemake.config["RESULT_NUMBER"])
 
-count=0
-lineN=0
-ids=0
-swapFlg=False
-scoreList=[]
-strucList=[]
-tempList=[]
+count = 0
+lineN = 0
+ids = 0
+swapFlg = False
+scoreList = []
+strucList = []
+tempList = []
 
 getLigNum = "zgrep -c 'REMARK RECEPTOR' " + inPath
 ligand_num =
int(subprocess.check_output(getLigNum, shell=True)) @@ -21,42 +23,42 @@ if num > 1: listSize = num else: - total=float(ligand_num) - listSize = ceil(num*total) + total = float(ligand_num) + listSize = ceil(num * total) -with gzip.open(inPath, 'rt', encoding='utf-8') as inFile: +with gzip.open(inPath, "rt", encoding="utf-8") as inFile: for line in inFile: if "REMARK RECEPTOR" in line: - lineN=0 - if count>0: - if count<=listSize: + lineN = 0 + if count > 0: + if count <= listSize: strucList.append(tempList) elif swapFlg: - strucList[ids]=tempList - swapFlg=False - count=count+1 - tempList=[] - if lineN==3: - strs=line.split() - curValue=float(strs[3]) - if count<=listSize: + strucList[ids] = tempList + swapFlg = False + count = count + 1 + tempList = [] + if lineN == 3: + strs = line.split() + curValue = float(strs[3]) + if count <= listSize: scoreList.append(curValue) else: - maxValue=max(scoreList) - if curValue (s0,s1), (s1,s2), (s2, s3), ..." - a, b= itertools.tee(iterable) + a, b = itertools.tee(iterable) next(b, None) return zip(a, b) + def group3(iterable): - '''iterates over 3 lines at once''' + """iterates over 3 lines at once""" a, b, c = itertools.tee(iterable, 3) next(b, None) next(c, None) next(c, None) return zip(a, b, c) + def extract_pdbqt(infile): """ :param infile: path to infile :param outfile: path to outfile :return: receptor name, IDs and enthalpies """ - target_IDs = [] - enthalpies = [] - references = set() - with open(infile, encoding='utf-8') as to_parse: + target_IDs = [] + enthalpies = [] + references = set() + with open(infile, encoding="utf-8") as to_parse: # this line reads: 'REMARK RECEPTOR path/to/target.pdbqt' headerline = to_parse.readline() # we want to extract the target name WITHOUT the suffix - receptor = Path(headerline.split(' ')[-1]).stem + receptor = Path(headerline.split(" ")[-1]).stem # we then proceed extracting the enthalpy per ligand # we iterate two line per iteration, because the file is structured, like: # # REMARK VINA RESULT: Enthalpy 1 Enthalpy 2 Enthalpy 3 # REMARK Name = for i, j in pairwise(to_parse): - if 'Name' in j: - ID = j.split(' = ')[-1].strip(linesep) + if "Name" in j: + ID = j.split(" = ")[-1].strip(linesep) if ID not in references: references.add(ID) target_IDs.append(ID) @@ -53,28 +56,29 @@ def extract_pdbqt(infile): return receptor, target_IDs, enthalpies + def extract_pdbqt_rescreening(infile): """ :param infile: path to infile :param outfile: path to outfile :return: receptor name, IDs and enthalpies """ - target_IDs = [] - enthalpies = [] - references = set() - with open(infile, encoding='utf-8') as to_parse: + target_IDs = [] + enthalpies = [] + references = set() + with open(infile, encoding="utf-8") as to_parse: # this line reads: 'REMARK RECEPTOR path/to/target.pdbqt' headerline = to_parse.readline() # we want to extract the target name WITHOUT the suffix - receptor = Path(headerline.split(' ')[-1]).stem + receptor = Path(headerline.split(" ")[-1]).stem # we then proceed extracting the enthalpy per ligand # we iterate two line per iteration, because the file is structured, like: # # REMARK VINA RESULT: Enthalpy 1 Enthalpy 2 Enthalpy 3 # REMARK Name = for a, _, j in group3(to_parse): - if 'Name' in j: - ID = j.split(' = ')[-1].strip(linesep) + if "Name" in j: + ID = j.split(" = ")[-1].strip(linesep) if ID not in references: references.add(ID) target_IDs.append(ID) @@ -82,50 +86,55 @@ def extract_pdbqt_rescreening(infile): return receptor, target_IDs, enthalpies + def unionVenn(bestLigandFile, outFile, reBest): - 
'''outputs union table of ''' + """outputs union table of""" ID = [] value = [] receptor, ID, value = extract_pdbqt(bestLigandFile) - out_df = pd.DataFrame(value, index = ID, columns = [receptor]) + out_df = pd.DataFrame(value, index=ID, columns=[receptor]) inputArray = [] for inFile in reBest: inputArray.append(inFile) - for i in inputArray: #join all rescreening results to first screening + for i in inputArray: # join all rescreening results to first screening receptor, ID, value = extract_pdbqt_rescreening(i) - df = pd.DataFrame(value, index = ID, columns = [receptor]) - out_df = out_df.join(df, how = 'inner') + df = pd.DataFrame(value, index=ID, columns=[receptor]) + out_df = out_df.join(df, how="inner") out_df.to_csv(outFile) + def getLigands(result_file): - """ take results file and outputs ligand names and receptor name """ + """take results file and outputs ligand names and receptor name""" ligandNames = [] - with open(result_file, encoding = 'utf-8') as result: + with open(result_file, encoding="utf-8") as result: for line in result: - if 'REMARK RECEPTOR' in line: - receptor = line.split('/')[-1] - if 'Name' in line: + if "REMARK RECEPTOR" in line: + receptor = line.split("/")[-1] + if "Name" in line: ligandNames.append(line.split()[-1]) return ligandNames, receptor + + def makeVenn(result_list): """takes rescreening results and outputs a venn diagram""" ligand_lists = [] receptor_list = [] for result in result_list: - ligands,receptor = getLigands(result) + ligands, receptor = getLigands(result) ligand_lists.append(ligands) receptor_list.append(receptor) - venn.venn(ligand_lists, snakemake.output[1], receptor_list, figsize=(16,16)) + venn.venn(ligand_lists, snakemake.output[1], receptor_list, figsize=(16, 16)) + -#rescreen_results = str(rescreen_results).split() +# rescreen_results = str(rescreen_results).split() if 1 < len(rescreen_results) <= 4: makeVenn(rescreen_results) else: - open(snakemake.output[1], 'a', encoding = 'utf-8').close() -unionVenn( snakemake.input[0], snakemake.output[0], rescreen_results) + open(snakemake.output[1], "a", encoding="utf-8").close() +unionVenn(snakemake.input[0], snakemake.output[0], rescreen_results) diff --git a/workflow/scripts/venn.py b/workflow/scripts/venn.py index 58abe8f..aa50ca3 100644 --- a/workflow/scripts/venn.py +++ b/workflow/scripts/venn.py @@ -1,16 +1,17 @@ """ creating venn diagrams""" - from itertools import chain from collections.abc import Iterable from matplotlib.patches import Circle, Ellipse import pylab + # -------------------------------------------------------------------- -alignment = {'horizontalalignment': 'center', 'verticalalignment': 'baseline'} +alignment = {"horizontalalignment": "center", "verticalalignment": "baseline"} # ------------------------------------------------------------------- + def venn(data, outfile, names=None, fill="number", **kwds): """ data: a list @@ -23,11 +24,11 @@ def venn(data, outfile, names=None, fill="number", **kwds): if data is None: raise Exception("No data!") if len(data) == 2: - venn2(data, outfile, names, fill,**kwds) + venn2(data, outfile, names, fill, **kwds) elif len(data) == 3: - venn3(data, outfile, names, fill,**kwds) + venn3(data, outfile, names, fill, **kwds) elif len(data) == 4: - venn4(data, outfile, names, fill,**kwds) + venn4(data, outfile, names, fill, **kwds) else: raise Exception("currently only 2-4 sets venn diagrams are supported") @@ -56,15 +57,15 @@ def get_labels(data, fill="number"): N = len(data) sets_data = [set(data[i]) for i in range(N)] # sets for 
separate groups - s_all = set(chain(*data)) # union of all sets + s_all = set(chain(*data)) # union of all sets # bin(3) --> '0b11', so bin(3).split('0b')[-1] will remove "0b" set_collections = {} for n in range(1, 2**N): - key = bin(n).split('0b')[-1].zfill(N) + key = bin(n).split("0b")[-1].zfill(N) value = s_all - sets_for_intersection = [sets_data[i] for i in range(N) if key[i] == '1'] - sets_for_difference = [sets_data[i] for i in range(N) if key[i] == '0'] + sets_for_intersection = [sets_data[i] for i in range(N) if key[i] == "1"] + sets_for_difference = [sets_data[i] for i in range(N) if key[i] == "0"] for s in sets_for_intersection: value = value & s for s in sets_for_difference: @@ -80,6 +81,7 @@ def get_labels(data, fill="number"): return labels + # -------------------------------------------------------------------- @@ -94,9 +96,9 @@ def venn2(data, outfile, names=None, fill="number", **kwds): labels = get_labels(data, fill=fill) # set figure size - if 'figsize' in kwds and len(kwds['figsize']) == 2: + if "figsize" in kwds and len(kwds["figsize"]) == 2: # if 'figsize' is in kwds, and it is a list or tuple with length of 2 - figsize = kwds['figsize'] + figsize = kwds["figsize"] else: # default figure size figsize = (8, 8) @@ -109,10 +111,14 @@ def venn2(data, outfile, names=None, fill="number", **kwds): ax.set_ylim(0, 8) # set colors for different Circles or ellipses - if 'colors' in kwds and isinstance(kwds['colors'], Iterable) and len(kwds['colors']) >= 2: - colors = kwds['colors'] + if ( + "colors" in kwds + and isinstance(kwds["colors"], Iterable) + and len(kwds["colors"]) >= 2 + ): + colors = kwds["colors"] else: - colors = ['red', 'green'] + colors = ["red", "green"] c1 = Circle((3.0, 4.0), radius=2.0, alpha=0.5, color=colors[0]) c2 = Circle((5.0, 4.0), radius=2.0, alpha=0.5, color=colors[1]) @@ -122,19 +128,21 @@ def venn2(data, outfile, names=None, fill="number", **kwds): # draw text # 1 - pylab.text(2.0, 4.0, labels['10'], **alignment) - pylab.text(2.5, 4.0, labels['01'], **alignment) + pylab.text(2.0, 4.0, labels["10"], **alignment) + pylab.text(2.5, 4.0, labels["01"], **alignment) # 2 - pylab.text(4.0, 4.0, labels['11'], **alignment) + pylab.text(4.0, 4.0, labels["11"], **alignment) # names of different groups if outfile: - pylab.text(3.0, 4.0-1.2*2.0, names[0], fontsize=16, **alignment) - pylab.text(5.0, 4.0-1.2*2.0, names[1], fontsize=16, **alignment) + pylab.text(3.0, 4.0 - 1.2 * 2.0, names[0], fontsize=16, **alignment) + pylab.text(5.0, 4.0 - 1.2 * 2.0, names[1], fontsize=16, **alignment) - leg = ax.legend(names, loc='best', fancybox=True) + leg = ax.legend(names, loc="best", fancybox=True) leg.get_frame().set_alpha(0.5) pylab.savefig(outfile) + + # -------------------------------------------------------------------- @@ -148,15 +156,15 @@ def venn3(data, outfile, names=None, fill="number", **kwds): labels = get_labels(data, fill=fill) # set figure size - if 'figsize' in kwds and len(kwds['figsize']) == 2: + if "figsize" in kwds and len(kwds["figsize"]) == 2: # if 'figsize' is in kwds, and it is a list or tuple with length of 2 - figsize = kwds['figsize'] + figsize = kwds["figsize"] else: # default figure size figsize = (10, 10) - fig = pylab.figure(figsize=figsize) # set figure size + fig = pylab.figure(figsize=figsize) # set figure size ax = fig.gca() - ax.set_aspect("equal") # set aspect ratio to 1 + ax.set_aspect("equal") # set aspect ratio to 1 ax.set_xticks([]) ax.set_yticks([]) ax.set_xlim(0, 8) @@ -164,7 +172,7 @@ def venn3(data, outfile, names=None, 
fill="number", **kwds): # set colors for different Circles or ellipses - colors = ['red', 'green', 'blue'] + colors = ["red", "green", "blue"] c1 = Circle((3.0, 3.0), radius=2.0, alpha=0.5, color=colors[0]) c2 = Circle((5.0, 3.0), radius=2.0, alpha=0.5, color=colors[1]) @@ -174,22 +182,22 @@ def venn3(data, outfile, names=None, fill="number", **kwds): # draw text # 1 - pylab.text(3.0-2/2, 3.0-2/2, labels['100'], **alignment) - pylab.text(5.0+2/2, 3.0-2/2, labels['010'], **alignment) - pylab.text((3.0+5.0)/2, 4.73+2/2, labels['001'], **alignment) + pylab.text(3.0 - 2 / 2, 3.0 - 2 / 2, labels["100"], **alignment) + pylab.text(5.0 + 2 / 2, 3.0 - 2 / 2, labels["010"], **alignment) + pylab.text((3.0 + 5.0) / 2, 4.73 + 2 / 2, labels["001"], **alignment) # 2 - pylab.text((3.0+5.0)/2, 3.0-2/2, labels['110'], **alignment) - pylab.text(3.0, 3.0+2*2/3, labels['101'], **alignment) - pylab.text(5.0, 3.0+2*2/3, labels['011'], **alignment) + pylab.text((3.0 + 5.0) / 2, 3.0 - 2 / 2, labels["110"], **alignment) + pylab.text(3.0, 3.0 + 2 * 2 / 3, labels["101"], **alignment) + pylab.text(5.0, 3.0 + 2 * 2 / 3, labels["011"], **alignment) # 3 - pylab.text((3.0+5.0)/2, 3.0+2/3, labels['111'], **alignment) + pylab.text((3.0 + 5.0) / 2, 3.0 + 2 / 3, labels["111"], **alignment) # names of different groups if outfile: pylab.text(1.0, 1.0, names[0], fontsize=16, **alignment) pylab.text(7.0, 1.0, names[1], fontsize=16, **alignment) - pylab.text(4.0, 4.73+1.2*2, names[2], fontsize=16, **alignment) + pylab.text(4.0, 4.73 + 1.2 * 2, names[2], fontsize=16, **alignment) - leg = ax.legend(names, loc='best', fancybox=True) + leg = ax.legend(names, loc="best", fancybox=True) leg.get_frame().set_alpha(0.5) pylab.savefig(outfile) @@ -203,20 +211,24 @@ def venn4(data, outfile, names=None, fill="number", **kwds): labels = get_labels(data, fill=fill) # set figure size - if 'figsize' in kwds and len(kwds['figsize']) == 2: + if "figsize" in kwds and len(kwds["figsize"]) == 2: # if 'figsize' is in kwds, and it is a list or tuple with length of 2 - figsize = kwds['figsize'] + figsize = kwds["figsize"] else: # default figure size figsize = (10, 10) # set colors for different Circles or ellipses - if 'colors' in kwds and isinstance(kwds['colors'], Iterable) and len(kwds['colors']) >= 4: - colors = kwds['colors'] + if ( + "colors" in kwds + and isinstance(kwds["colors"], Iterable) + and len(kwds["colors"]) >= 4 + ): + colors = kwds["colors"] else: - colors = ['r', 'g', 'b', 'c'] + colors = ["r", "g", "b", "c"] # draw ellipse, the coordinates are hard coded in the rest of the function - fig = pylab.figure(figsize=figsize) # set figure size + fig = pylab.figure(figsize=figsize) # set figure size ax = fig.gca() patches = [] width, height = 170, 110 # width and height of the ellipses @@ -234,24 +246,24 @@ def venn4(data, outfile, names=None, fill="number", **kwds): # draw text # 1 - pylab.text(120, 200, labels['1000'], **alignment) - pylab.text(280, 200, labels['0100'], **alignment) - pylab.text(155, 250, labels['0010'], **alignment) - pylab.text(245, 250, labels['0001'], **alignment) + pylab.text(120, 200, labels["1000"], **alignment) + pylab.text(280, 200, labels["0100"], **alignment) + pylab.text(155, 250, labels["0010"], **alignment) + pylab.text(245, 250, labels["0001"], **alignment) # 2 - pylab.text(200, 115, labels['1100'], **alignment) - pylab.text(140, 225, labels['1010'], **alignment) - pylab.text(145, 155, labels['1001'], **alignment) - pylab.text(255, 155, labels['0110'], **alignment) - pylab.text(260, 225, labels['0101'], 
**alignment) - pylab.text(200, 240, labels['0011'], **alignment) + pylab.text(200, 115, labels["1100"], **alignment) + pylab.text(140, 225, labels["1010"], **alignment) + pylab.text(145, 155, labels["1001"], **alignment) + pylab.text(255, 155, labels["0110"], **alignment) + pylab.text(260, 225, labels["0101"], **alignment) + pylab.text(200, 240, labels["0011"], **alignment) # 3 - pylab.text(235, 205, labels['0111'], **alignment) - pylab.text(165, 205, labels['1011'], **alignment) - pylab.text(225, 135, labels['1101'], **alignment) - pylab.text(175, 135, labels['1110'], **alignment) + pylab.text(235, 205, labels["0111"], **alignment) + pylab.text(165, 205, labels["1011"], **alignment) + pylab.text(225, 135, labels["1101"], **alignment) + pylab.text(175, 135, labels["1110"], **alignment) # 4 - pylab.text(200, 175, labels['1111'], **alignment) + pylab.text(200, 175, labels["1111"], **alignment) # names of different groups if outfile: pylab.text(110, 110, names[0], fontsize=16, **alignment) @@ -259,7 +271,7 @@ def venn4(data, outfile, names=None, fill="number", **kwds): pylab.text(130, 275, names[2], fontsize=16, **alignment) pylab.text(270, 275, names[3], fontsize=16, **alignment) - leg = ax.legend(names, loc='best', fancybox=True) + leg = ax.legend(names, loc="best", fancybox=True) leg.get_frame().set_alpha(0.5) pylab.savefig(outfile) From 043abb5584981e10b71cd7ea116f41b10752eeb3 Mon Sep 17 00:00:00 2001 From: meesters Date: Wed, 6 Aug 2025 13:55:03 +0200 Subject: [PATCH 2/3] refactor: Snakefile --- workflow/Snakefile | 141 ++++++++++++++++++++++++++++++++------------- 1 file changed, 102 insertions(+), 39 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index ecdaa75..4287547 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -3,19 +3,19 @@ from os import path from snakemake.utils import min_version -min_version("7.19.1") # this is where SLURM support was introduced +min_version("7.19.1") # this is where SLURM support was introduced -INPUT_DIR=config["INPUT_DIR"] +INPUT_DIR = config["INPUT_DIR"] -MIN_DIR=config["PREPARED_LIGAND_DIR"] +MIN_DIR = config["PREPARED_LIGAND_DIR"] -PREPARED_DIR=config["PREPARED_DATA_DIR"] +PREPARED_DIR = config["PREPARED_DATA_DIR"] -OUTPUT_DIR=config["OUTPUT_DIR"] +OUTPUT_DIR = config["OUTPUT_DIR"] -TMP_DIR=config["TEMP_DATA_DIR"] +TMP_DIR = config["TEMP_DATA_DIR"] -LOCAL_INPUT =config["LOCAL_INPUT_DIR"] +LOCAL_INPUT = config["LOCAL_INPUT_DIR"] DATABASE = config["DATABASE"] @@ -23,77 +23,140 @@ SUBSET = config["SUBSET"] RESCREENING_TARGETS = config["RESCREENING_TARGETS"] + def generateOutput(wildcards): irods = path.join(OUTPUT_DIR, "results", "irods.json") if config["RESCREENING"] == "TRUE": - out = expand(path.join(OUTPUT_DIR, "results", "rescreening_{percentage}", "{receptorID}", "union.csv"), - receptorID = config["TARGETS"][0].split(',')[0], - percentage = config["RESULT_NUMBER"], - combAll = combAll) - hist = expand(path.join(OUTPUT_DIR, "results", "{receptorID}_hist.png"), - receptorID = config["TARGETS"][0].split(',')[0]) + out = expand( + path.join( + OUTPUT_DIR, + "results", + "rescreening_{percentage}", + "{receptorID}", + "union.csv", + ), + receptorID=config["TARGETS"][0].split(",")[0], + percentage=config["RESULT_NUMBER"], + combAll=combAll, + ) + hist = expand( + path.join(OUTPUT_DIR, "results", "{receptorID}_hist.png"), + receptorID=config["TARGETS"][0].split(",")[0], + ) - return hist + out + [irods] + return hist + out + [irods] else: - out = expand(path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.csv"), - 
receptorID = config["TARGETS"][0].split(',')[0], - percentage = config["RESULT_NUMBER"]) - hist = expand(path.join(OUTPUT_DIR, "results", "{receptorID}_hist.png"), - receptorID = config["TARGETS"][0].split(',')[0]) + out = expand( + path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.csv"), + receptorID=config["TARGETS"][0].split(",")[0], + percentage=config["RESULT_NUMBER"], + ) + hist = expand( + path.join(OUTPUT_DIR, "results", "{receptorID}_hist.png"), + receptorID=config["TARGETS"][0].split(",")[0], + ) return hist + out + [irods] -localrules: all, generateIRODS, dockingResultsTxt, removeDuplicateLigands, makeVenn, prepareLigands2, mergeDocking2, bestLigands, prepareSecondDocking, convertMol2, makeReceptorPDBQT, mergeDocking ,mergeLocalInput, split, split2, targetProtein, getZINCdata, getZINCSubsets, gunzip, ENAMINEdownload, prepareReceptor, prepareDocking, prepareLibrary, prepareGeometry, makeHistogram, cleanLigands - -targetList=[] #get ProteinIDs from configfile for rescreening +localrules: + all, + generateIRODS, + dockingResultsTxt, + removeDuplicateLigands, + makeVenn, + prepareLigands2, + mergeDocking2, + bestLigands, + prepareSecondDocking, + convertMol2, + makeReceptorPDBQT, + mergeDocking, + mergeLocalInput, + split, + split2, + targetProtein, + getZINCdata, + getZINCSubsets, + gunzip, + ENAMINEdownload, + prepareReceptor, + prepareDocking, + prepareLibrary, + prepareGeometry, + makeHistogram, + cleanLigands, + + +targetList = [] # get ProteinIDs from configfile for rescreening for i in config["RESCREENING_TARGETS"]: - targetList.append(i.split(',')[0]) + targetList.append(i.split(",")[0]) combList = [] for comb in itertools.combinations(targetList, 2): # all combinations of two targets - combList.append('_'.join(comb)) + combList.append("_".join(comb)) + +combAll = "_".join(targetList) # combine all rescreening targets -combAll = '_'.join(targetList) # combine all rescreening targets def getAllVenn(wildcards): - path.join(OUTPUT_DIR, "output", "rescreening", "{receptorID}", "{RESCREENING_TARGETS}_union.txt") + path.join( + OUTPUT_DIR, + "output", + "rescreening", + "{receptorID}", + "{RESCREENING_TARGETS}_union.txt", + ) def IRODSinput(wildcards): if config["RESCREENING"] == "TRUE": - out = expand(path.join(OUTPUT_DIR, "results", "rescreening_{percentage}", "{receptorID}", "union.csv"), - receptorID = config["TARGETS"][0].split(',')[0], - percentage = config["RESULT_NUMBER"], - combAll = combAll) + out = expand( + path.join( + OUTPUT_DIR, + "results", + "rescreening_{percentage}", + "{receptorID}", + "union.csv", + ), + receptorID=config["TARGETS"][0].split(",")[0], + percentage=config["RESULT_NUMBER"], + combAll=combAll, + ) else: - out = expand(path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.csv"), - receptorID = config["TARGETS"][0].split(',')[0], - percentage = config["RESULT_NUMBER"]) + out = expand( + path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.csv"), + receptorID=config["TARGETS"][0].split(",")[0], + percentage=config["RESULT_NUMBER"], + ) return out + def resources(RULE_NAME): out_list = [] for key in config[RULE_NAME]: - out_list.append(f'{key} = {config[RULE_NAME][key]}') - out_str = ',\n'.join(out_list) + out_list.append(f"{key} = {config[RULE_NAME][key]}") + out_str = ",\n".join(out_list) return out_str + rule all: input: - generateOutput + generateOutput, + rule generateIRODS: input: - IRODSinput + IRODSinput, output: - path.join(OUTPUT_DIR, "results", "irods.json") + path.join(OUTPUT_DIR, "results", "irods.json"), log: - 
"logs/generateIRODS.log" + "logs/generateIRODS.log", script: "scripts/generateIRODS.py" + include: "rules/analyse.smk" include: "rules/docking.smk" include: "rules/preparation.smk" From 8834537c86a3e039b4ff975a525ffc294f7ce3ea Mon Sep 17 00:00:00 2001 From: meesters Date: Wed, 6 Aug 2025 13:56:21 +0200 Subject: [PATCH 3/3] refactor: refactored all rule files --- workflow/rules/analyse.smk | 353 +++++++++++++++++++++++---------- workflow/rules/docking.smk | 67 ++++--- workflow/rules/preparation.smk | 111 +++++++---- 3 files changed, 362 insertions(+), 169 deletions(-) diff --git a/workflow/rules/analyse.smk b/workflow/rules/analyse.smk index 61aae9b..eb7b05c 100644 --- a/workflow/rules/analyse.smk +++ b/workflow/rules/analyse.smk @@ -4,6 +4,7 @@ import os import requests from snakemake.logging import logger + def url_reachable(url): """ test for reachable URL @@ -11,30 +12,40 @@ def url_reachable(url): r = requests.head(url) return r.status_code == 200 + def getTranches(): - '''return traches from parsing last log file''' - log_files = glob.glob('.snakemake/log/*') + """return traches from parsing last log file""" + log_files = glob.glob(".snakemake/log/*") sorted_files = sorted(log_files, key=os.path.getmtime) tranch_list = [] - pattern = '[A-K][A-K][ABCEGI][ABCDEF][RMLH][NMLOP]' + pattern = "[A-K][A-K][ABCEGI][ABCDEF][RMLH][NMLOP]" with open(sorted_files[-2]) as log: matches = re.findall(pattern, log.read()) return list(set(matches)) + def library(wildcards): - if DATABASE[0]=="ZINC": # ZINC database selected + if DATABASE[0] == "ZINC": # ZINC database selected - if SUBSET=="TRANCHES": # Tranches selected + if SUBSET == "TRANCHES": # Tranches selected out = [] # test for ZINC reachability: if not url_reachable("http://files.docking.org/3D/"): - logger.info("The ZINC database is not accessible right now. Perhaps it is temporarily down?") - user_input = input("Have you already run this workflow in the current folder with the same input data?(y/n) \n") + logger.info( + "The ZINC database is not accessible right now. Perhaps it is temporarily down?" + ) + user_input = input( + "Have you already run this workflow in the current folder with the same input data?(y/n) \n" + ) if user_input == "y": - logger.info("Trying to proceed without accessing the ZINC database (http://files.docking.org)") - tranch_list = getTranches() #get list with 6 letter code specifing ZINC tranches + logger.info( + "Trying to proceed without accessing the ZINC database (http://files.docking.org)" + ) + tranch_list = ( + getTranches() + ) # get list with 6 letter code specifing ZINC tranches - receptorID = config["TARGETS"][0].split(',')[0] + receptorID = config["TARGETS"][0].split(",")[0] database = config["DATABASE"] for i in tranch_list: @@ -47,185 +58,264 @@ def library(wildcards): else: return out else: - logger.error("Aborting the snakemake run for now, as the data from the ZINC database (http://files.docking.org) are temporarily not available. Please try at a later time.") + logger.error( + "Aborting the snakemake run for now, as the data from the ZINC database (http://files.docking.org) are temporarily not available. Please try at a later time." 
+ ) sys.exit(1) - rawOut= expand(path.join(OUTPUT_DIR, "output", "{receptorID}", - "{receptorID}_{database}_{w}{l}_{w}{l}{r}{p}{ph}{c}.pdbqt.gz"), - receptorID = config["TARGETS"][0].split(',')[0], - database = config["DATABASE"], - w = config["ZINC_INPUT"]["WEIGHT"], - l = config["ZINC_INPUT"]["LOGP"], - r = config["ZINC_INPUT"]["REACT"], - p = config["ZINC_INPUT"]["PURCHASE"], - ph = config["ZINC_INPUT"]["PH"], - c = config["ZINC_INPUT"]["CHARGE"], - dataset = (config["ZINC_INPUT"]["WEIGHT"]) + (config["ZINC_INPUT"]["LOGP"])) + rawOut = expand( + path.join( + OUTPUT_DIR, + "output", + "{receptorID}", + "{receptorID}_{database}_{w}{l}_{w}{l}{r}{p}{ph}{c}.pdbqt.gz", + ), + receptorID=config["TARGETS"][0].split(",")[0], + database=config["DATABASE"], + w=config["ZINC_INPUT"]["WEIGHT"], + l=config["ZINC_INPUT"]["LOGP"], + r=config["ZINC_INPUT"]["REACT"], + p=config["ZINC_INPUT"]["PURCHASE"], + ph=config["ZINC_INPUT"]["PH"], + c=config["ZINC_INPUT"]["CHARGE"], + dataset=(config["ZINC_INPUT"]["WEIGHT"]) + + (config["ZINC_INPUT"]["LOGP"]), + ) for i in rawOut: weighLog = i.split("_")[-2] restAttr = (i.split("_")[-1])[2:6] - url = "".join(['http://files.docking.org/3D/', weighLog,'/', restAttr, '/', weighLog,restAttr,'.xaa.pdbqt.gz']) + url = "".join( + [ + "http://files.docking.org/3D/", + weighLog, + "/", + restAttr, + "/", + weighLog, + restAttr, + ".xaa.pdbqt.gz", + ] + ) r = requests.get(url) if r.status_code == 200: out.append(i) if not out: - logger.error("All selected tranches are empty; select other parameters!") + logger.error( + "All selected tranches are empty; select other parameters!" + ) sys.exit(1) else: return out - else: # if not tranches select subset + else: # if not tranches select subset - out = expand(path.join(OUTPUT_DIR, "output", "{receptorID}", - "{receptorID}_{database}_subsets_{subset}.pdbqt.gz"), - database = config["DATABASE"], - subset = config["SUBSET"], - receptorID = config["TARGETS"][0].split(',')[0]) + out = expand( + path.join( + OUTPUT_DIR, + "output", + "{receptorID}", + "{receptorID}_{database}_subsets_{subset}.pdbqt.gz", + ), + database=config["DATABASE"], + subset=config["SUBSET"], + receptorID=config["TARGETS"][0].split(",")[0], + ) - url = "https://zinc15.docking.org/substances/subsets/" + SUBSET + ".mol2?count=all" + url = ( + "https://zinc15.docking.org/substances/subsets/" + + SUBSET + + ".mol2?count=all" + ) r = requests.get(url) - if r.status_code == 200: #test if subset is valid + if r.status_code == 200: # test if subset is valid return out r_zinc = requests.get("https://zinc15.docking.org/") - if r_zinc.status_code != 200: #test if ZINC database is available - logger.info("The ZINC database is not accessible right now. Perhaps it is temporarily down?") - #if ZINC not available, but dataset is already downloaded --> continue - subset_dir = path.join(INPUT_DIR,config["SUBSET"]+'.mol2') + if r_zinc.status_code != 200: # test if ZINC database is available + logger.info( + "The ZINC database is not accessible right now. Perhaps it is temporarily down?" + ) + # if ZINC not available, but dataset is already downloaded --> continue + subset_dir = path.join(INPUT_DIR, config["SUBSET"] + ".mol2") if os.path.isfile(subset_dir): return out else: - logger.error("Subset is not availiable in the specified data folder. \n Abort snakemake run, try again later") + logger.error( + "Subset is not availiable in the specified data folder. 
\n Abort snakemake run, try again later" + ) sys.exit(1) else: - logger.error( "Invalid subset name!") + logger.error("Invalid subset name!") sys.exit(1) - else: # not ZINC database --> local input data - best = expand(path.join(OUTPUT_DIR, "output", "{receptorID}", - "{receptorID}_{database}_{dataset}_local.pdbqt.gz"), - database = config["DATABASE"], - dataset = config["LOC_DATA"], - receptorID = config["TARGETS"][0].split(',')[0]) + else: # not ZINC database --> local input data + best = expand( + path.join( + OUTPUT_DIR, + "output", + "{receptorID}", + "{receptorID}_{database}_{dataset}_local.pdbqt.gz", + ), + database=config["DATABASE"], + dataset=config["LOC_DATA"], + receptorID=config["TARGETS"][0].split(",")[0], + ) return best + rule makeHistogram: input: - path.join(OUTPUT_DIR, "results", "{receptorID}.pdbqt.gz") + path.join(OUTPUT_DIR, "results", "{receptorID}.pdbqt.gz"), output: - report(path.join(OUTPUT_DIR, "results", "{receptorID}_hist.png"), category="Histogram") + report( + path.join(OUTPUT_DIR, "results", "{receptorID}_hist.png"), + category="Histogram", + ), log: - "logs/makeHistogram_{receptorID}.log" + "logs/makeHistogram_{receptorID}.log", envmodules: - config["PYPLOT"] + config["PYPLOT"], script: "../scripts/makeHistogram.py" + rule bestLigands: input: - library + library, output: - path.join(OUTPUT_DIR, "results", "{receptorID}.pdbqt.gz") + path.join(OUTPUT_DIR, "results", "{receptorID}.pdbqt.gz"), log: - "logs/bestLigands_{receptorID}.log" + "logs/bestLigands_{receptorID}.log", script: "../scripts/mergeOutput.py" + rule dockingResults: input: - path.join(OUTPUT_DIR, "results", "{receptorID}.pdbqt.gz") + path.join(OUTPUT_DIR, "results", "{receptorID}.pdbqt.gz"), output: - path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.pdbqt") + path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.pdbqt"), envmodules: - config["PYTHON"] + config["PYTHON"], log: - "logs/dockingResults_{receptorID}_{percentage}.log" + "logs/dockingResults_{receptorID}_{percentage}.log", threads: config["DOCKING_RESULTS"]["threads"] resources: mem_mb=config["DOCKING_RESULTS"]["mem_mb"], runtime=config["DOCKING_RESULTS"]["runtime"], - partition=config["DOCKING_RESULTS"]["partition"] + partition=config["DOCKING_RESULTS"]["partition"], script: "../scripts/sortResult.py" + rule dockingResultsTxt: input: - path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.pdbqt") + path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.pdbqt"), output: - path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.csv") + path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.csv"), log: - "logs/dockingResultsTxt_{receptorID}_{percentage}.log" + "logs/dockingResultsTxt_{receptorID}_{percentage}.log", wildcard_constraints: receptorID="[^/]+", - percentage="[^/]+" + percentage="[^/]+", script: "../scripts/ResultTxt.py" + rule removeDuplicateLigands: input: - path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.pdbqt") + path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.pdbqt"), output: - path.join(OUTPUT_DIR, "rescreening", "unique", "{receptorID}_{percentage}.pdbqt") + path.join( + OUTPUT_DIR, "rescreening", "unique", "{receptorID}_{percentage}.pdbqt" + ), log: - "logs/removeDuplicateLigands_{receptorID}_{percentage}.log" + "logs/removeDuplicateLigands_{receptorID}_{percentage}.log", shell: "sed '/MODEL [2-9]/,/ENDMDL/d' {input} > {output}" + checkpoint split2: input: - path.join(OUTPUT_DIR, "rescreening", "unique", "{receptorID}_{percentage}.pdbqt") + path.join( + 
OUTPUT_DIR, "rescreening", "unique", "{receptorID}_{percentage}.pdbqt" + ), output: - directory(os.path.join(TMP_DIR, "rescreening_ligands_{percentage}", "{receptorID}")) + directory( + os.path.join(TMP_DIR, "rescreening_ligands_{percentage}", "{receptorID}") + ), log: - "logs/split2_{receptorID}_{percentage}.log" + "logs/split2_{receptorID}_{percentage}.log", script: "../scripts/splitFile.py" + rule prepareLigands2: input: - ligands = path.join(TMP_DIR, "rescreening_ligands_{percentage}", "{receptorID}","{i}.pdbqt") + ligands=path.join( + TMP_DIR, "rescreening_ligands_{percentage}", "{receptorID}", "{i}.pdbqt" + ), output: - ligands = path.join(OUTPUT_DIR, "rescreening_{percentage}", "{name}_{receptorID}", "{i}.txt") + ligands=path.join( + OUTPUT_DIR, "rescreening_{percentage}", "{name}_{receptorID}", "{i}.txt" + ), log: - "logs/prepareLigands2_{receptorID}_{percentage}_{name}_{i}.log" + "logs/prepareLigands2_{receptorID}_{percentage}_{name}_{i}.log", shell: "echo {input.ligands} > {output.ligands}" + rule prepareSecondDocking: input: - grid = path.join(OUTPUT_DIR,"grid","{name}_grid.txt"), - receptor = path.join(PREPARED_DIR,"receptor", "{name}.pdbqt") + grid=path.join(OUTPUT_DIR, "grid", "{name}_grid.txt"), + receptor=path.join(PREPARED_DIR, "receptor", "{name}.pdbqt"), output: - grid = path.join(OUTPUT_DIR, "rescreening_{percentage}", "{name}_{receptorID}", "{name}.grd"), - receptor = path.join(OUTPUT_DIR, "rescreening_{percentage}", "{name}_{receptorID}", "{name}.rec") + grid=path.join( + OUTPUT_DIR, "rescreening_{percentage}", "{name}_{receptorID}", "{name}.grd" + ), + receptor=path.join( + OUTPUT_DIR, "rescreening_{percentage}", "{name}_{receptorID}", "{name}.rec" + ), log: - "logs/prepareSecondDocking_{name}_{receptorID}_{percentage}.log" + "logs/prepareSecondDocking_{name}_{receptorID}_{percentage}.log", shell: """ cp {input.grid} {output.grid} echo {input.receptor} > {output.receptor} """ + rule docking2: input: - ligands = path.join(OUTPUT_DIR, "rescreening_{percentage}", "{name}_{receptorID}", "{i}.txt"), - grid = path.join(OUTPUT_DIR, "rescreening_{percentage}", "{name}_{receptorID}", "{name}.grd"), - receptor = path.join(OUTPUT_DIR, "rescreening_{percentage}", "{name}_{receptorID}", "{name}.rec") + ligands=path.join( + OUTPUT_DIR, "rescreening_{percentage}", "{name}_{receptorID}", "{i}.txt" + ), + grid=path.join( + OUTPUT_DIR, "rescreening_{percentage}", "{name}_{receptorID}", "{name}.grd" + ), + receptor=path.join( + OUTPUT_DIR, "rescreening_{percentage}", "{name}_{receptorID}", "{name}.rec" + ), output: - path.join(OUTPUT_DIR, "rescreening_{percentage}", "{name}_{receptorID}","{name}.rec_{i}.txt.pdbqt.gz") + path.join( + OUTPUT_DIR, + "rescreening_{percentage}", + "{name}_{receptorID}", + "{name}.rec_{i}.txt.pdbqt.gz", + ), log: - "logs/docking2_{name}_{receptorID}_{percentage}_{i}.log" + "logs/docking2_{name}_{receptorID}_{percentage}_{i}.log", params: - dir = path.join(OUTPUT_DIR, "rescreening_{percentage}"), - gridfile = path.join(config["GRID_DIR"], "{name}.gpf"), - cutOff = config["CUTOFF_VALUE"] + dir=path.join(OUTPUT_DIR, "rescreening_{percentage}"), + gridfile=path.join(config["GRID_DIR"], "{name}.gpf"), + cutOff=config["CUTOFF_VALUE"], resources: - mpi = True, + mpi=True, partition=config["DOCKING"]["partition"], tasks=config["DOCKING"]["ntasks"], slurm_extra=config["DOCKING"]["slurm_extra"], - runtime=config["DOCKING"]["runtime"] + runtime=config["DOCKING"]["runtime"], envmodules: - config["VINALC"] + config["VINALC"], shell: """( cd 
{params.dir}/{wildcards.name}_{wildcards.receptorID} @@ -235,47 +325,102 @@ rule docking2: cd - )""" + def aggregate_in2(wildcards): checkpoint_output = checkpoints.split2.get(**wildcards).output[0] - files_names = expand(path.join(OUTPUT_DIR, "rescreening_{{percentage}}", "{{name}}_{{receptorID}}","{{name}}.rec_{i}.txt.pdbqt.gz"), - i = glob_wildcards(os.path.join(checkpoint_output, '{i}.pdbqt')).i) + files_names = expand( + path.join( + OUTPUT_DIR, + "rescreening_{{percentage}}", + "{{name}}_{{receptorID}}", + "{{name}}.rec_{i}.txt.pdbqt.gz", + ), + i=glob_wildcards(os.path.join(checkpoint_output, "{i}.pdbqt")).i, + ) return files_names + rule mergeDocking2: input: - unpack(aggregate_in2) + unpack(aggregate_in2), output: - path.join(OUTPUT_DIR, "output", "rescreening_{percentage}", "{name}_{receptorID}","{name}.pdbqt.gz") + path.join( + OUTPUT_DIR, + "output", + "rescreening_{percentage}", + "{name}_{receptorID}", + "{name}.pdbqt.gz", + ), log: - "logs/mergeDocking2_{name}_{receptorID}_{percentage}.log" + "logs/mergeDocking2_{name}_{receptorID}_{percentage}.log", shell: - ''' + """ cat {input} >> {output} - ''' + """ + rule dockingResults2: input: - path.join(OUTPUT_DIR, "output", "rescreening_{percentage}","{name}_{receptorID}","{name}.pdbqt.gz") + path.join( + OUTPUT_DIR, + "output", + "rescreening_{percentage}", + "{name}_{receptorID}", + "{name}.pdbqt.gz", + ), output: - path.join(OUTPUT_DIR,"output","rescreening_{percentage}","{name}_{receptorID}","{name}_best.pdbqt") + path.join( + OUTPUT_DIR, + "output", + "rescreening_{percentage}", + "{name}_{receptorID}", + "{name}_best.pdbqt", + ), envmodules: - config["PYTHON"] + config["PYTHON"], log: - "logs/dockingResults2_{name}_{receptorID}_{percentage}.log" + "logs/dockingResults2_{name}_{receptorID}_{percentage}.log", script: "../scripts/sortResult.py" + rule makeVenn: input: - best = path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.pdbqt"), - re_results = expand(path.join(OUTPUT_DIR,"output","rescreening_{percentage}","{name}_{receptorID}","{name}_best.pdbqt"), - percentage = config["RESULT_NUMBER"], - receptorID = config["TARGETS"][0].split(',')[0], - name = targetList) + best=path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.pdbqt"), + re_results=expand( + path.join( + OUTPUT_DIR, + "output", + "rescreening_{percentage}", + "{name}_{receptorID}", + "{name}_best.pdbqt", + ), + percentage=config["RESULT_NUMBER"], + receptorID=config["TARGETS"][0].split(",")[0], + name=targetList, + ), output: - report(path.join(OUTPUT_DIR,"results","rescreening_{percentage}","{receptorID}","union.csv"), category="Rescreening"), - report(path.join(OUTPUT_DIR,"results","rescreening_{percentage}","{receptorID}","venn.png"), category="Rescreening") + report( + path.join( + OUTPUT_DIR, + "results", + "rescreening_{percentage}", + "{receptorID}", + "union.csv", + ), + category="Rescreening", + ), + report( + path.join( + OUTPUT_DIR, + "results", + "rescreening_{percentage}", + "{receptorID}", + "venn.png", + ), + category="Rescreening", + ), log: - "logs/makeVenn_{receptorID}_{percentage}.log" + "logs/makeVenn_{receptorID}_{percentage}.log", script: "../scripts/union_venn.py" diff --git a/workflow/rules/docking.smk b/workflow/rules/docking.smk index 505bc38..c1ed1ac 100644 --- a/workflow/rules/docking.smk +++ b/workflow/rules/docking.smk @@ -1,8 +1,8 @@ def get_spacing(gridfile): - #""" - #function to retrieve the 'spacing' (scale) of a grid file - #""" - #with open(gridfile) as infile: + # """ + # function to retrieve the 'spacing' 
(scale) of a grid file + # """ + # with open(gridfile) as infile: # for line in gridfile: # match = re.findall("spacing", line) # if match: @@ -12,28 +12,34 @@ def get_spacing(gridfile): # raise WorkflowError("unable to convert spacing to float from grid file: {gridfile}") # return spacing pass - + rule docking: input: - receptor = path.join(OUTPUT_DIR,"receptor","{receptorID}.txt"), - geometry = path.join(OUTPUT_DIR,"grid","{receptorID}_grid.txt"), - ligands = path.join(OUTPUT_DIR,"library","{database}_{dataset}_{name}_{i}.txt") + receptor=path.join(OUTPUT_DIR, "receptor", "{receptorID}.txt"), + geometry=path.join(OUTPUT_DIR, "grid", "{receptorID}_grid.txt"), + ligands=path.join(OUTPUT_DIR, "library", "{database}_{dataset}_{name}_{i}.txt"), output: - path.join(OUTPUT_DIR, "output","{receptorID}","{dataset}","{receptorID}.txt_{database}_{dataset}_{name}_{i}.txt.pdbqt.gz") + path.join( + OUTPUT_DIR, + "output", + "{receptorID}", + "{dataset}", + "{receptorID}.txt_{database}_{dataset}_{name}_{i}.txt.pdbqt.gz", + ), envmodules: - config["VINALC"] + config["VINALC"], params: - dir = path.join(OUTPUT_DIR, "output"), - space = get_spacing(path.join(config["GRID_DIR"], "{receptorID}.gpf")), - errorDir = path.join(OUTPUT_DIR, "errorLigands.txt"), + dir=path.join(OUTPUT_DIR, "output"), + space=get_spacing(path.join(config["GRID_DIR"], "{receptorID}.gpf")), + errorDir=path.join(OUTPUT_DIR, "errorLigands.txt"), resources: - partition = config["DOCKING"]["partition"], - runtime = config["DOCKING"]["runtime"], - slurm_extra = config["DOCKING"]["slurm_extra"], - mpi = "srun", - mem_mb_per_cpu = config["DOCKING"]["mem_mb_per_cpu"], - ntasks = config["DOCKING"]["ntasks"] + partition=config["DOCKING"]["partition"], + runtime=config["DOCKING"]["runtime"], + slurm_extra=config["DOCKING"]["slurm_extra"], + mpi="srun", + mem_mb_per_cpu=config["DOCKING"]["mem_mb_per_cpu"], + ntasks=config["DOCKING"]["ntasks"], shell: """( mkdir -p {params.dir}/{wildcards.receptorID}/{wildcards.dataset} @@ -45,16 +51,31 @@ rule docking: cd - )""" + def aggregate_in(wildcards): checkpoint_output = checkpoints.split.get(**wildcards).output[0] - files_names = expand(path.join(OUTPUT_DIR,"output","{{receptorID}}", "{{dataset}}", "{{receptorID}}.txt_{{database}}_{{dataset}}_{{name}}_{i}.txt.pdbqt.gz"), - i = glob_wildcards(os.path.join(checkpoint_output, "{i}.pdbqt")).i) + files_names = expand( + path.join( + OUTPUT_DIR, + "output", + "{{receptorID}}", + "{{dataset}}", + "{{receptorID}}.txt_{{database}}_{{dataset}}_{{name}}_{i}.txt.pdbqt.gz", + ), + i=glob_wildcards(os.path.join(checkpoint_output, "{i}.pdbqt")).i, + ) return files_names + rule mergeDocking: input: - unpack(aggregate_in) + unpack(aggregate_in), output: - path.join(OUTPUT_DIR, "output", "{receptorID}", "{receptorID}_{database}_{dataset}_{name}.pdbqt.gz") + path.join( + OUTPUT_DIR, + "output", + "{receptorID}", + "{receptorID}_{database}_{dataset}_{name}.pdbqt.gz", + ), script: "../scripts/mergeOutput.py" diff --git a/workflow/rules/preparation.smk b/workflow/rules/preparation.smk index 1e3a3da..303b2b8 100644 --- a/workflow/rules/preparation.smk +++ b/workflow/rules/preparation.smk @@ -1,144 +1,171 @@ ruleorder: convertMol2 > gunzip + rule targetProtein: output: - path.join(INPUT_DIR, "PDB","receptor","{receptorID}.pdb.gz") + path.join(INPUT_DIR, "PDB", "receptor", "{receptorID}.pdb.gz"), shell: "curl -o {output} {config[TARGET_URL]}/{wildcards.receptorID}.pdb.gz" + rule getZINCdata: output: - path.join(INPUT_DIR, "ZINC","{dataset}","{name}.pdbqt.gz") + 
path.join(INPUT_DIR, "ZINC", "{dataset}", "{name}.pdbqt.gz"), script: "../scripts/ZINCdownload.py" + rule getZINCSubsets: - params: sub = config["SUBSET"] + params: + sub=config["SUBSET"], output: - expand(path.join(INPUT_DIR, "ZINC", "subsets", "{subset}.mol2"), subset=config["SUBSET"]) + expand( + path.join(INPUT_DIR, "ZINC", "subsets", "{subset}.mol2"), + subset=config["SUBSET"], + ), script: "../scripts/downloadZINCsubsets.py" + rule convertMol2: input: - path.join(INPUT_DIR, "ZINC", "subsets", "{subset}.mol2") + path.join(INPUT_DIR, "ZINC", "subsets", "{subset}.mol2"), output: - path.join(TMP_DIR,"unzipped", "ZINC","subsets","{subset}.pdbqt") + path.join(TMP_DIR, "unzipped", "ZINC", "subsets", "{subset}.pdbqt"), envmodules: - config["OPENBABEL"] + config["OPENBABEL"], shell: "obabel {input} -opdbqt -O {output}" + rule mergeLocalInput: - params: in_dir = LOCAL_INPUT + params: + in_dir=LOCAL_INPUT, output: - path.join(TMP_DIR,"unzipped", "{database}","{dataset}","local.pdbqt") + path.join(TMP_DIR, "unzipped", "{database}", "{dataset}", "local.pdbqt"), envmodules: - config["OPENBABEL"] + config["OPENBABEL"], script: "../scripts/concatPDBQT.py" + rule ENAMINEdownload: output: - expand(path.join(INPUT_DIR, "ENAMINE", "{ENAMINE_collection}"), ENAMINE_collection=config["ENAMINE_INPUT"]) + expand( + path.join(INPUT_DIR, "ENAMINE", "{ENAMINE_collection}"), + ENAMINE_collection=config["ENAMINE_INPUT"], + ), script: "../scripts/ENAMINEdownload.py" + rule SDFToPDBQT: input: - path.join(TMP_DIR,"unzipped", "{database}","{dataset}","{name}.sdf") + path.join(TMP_DIR, "unzipped", "{database}", "{dataset}", "{name}.sdf"), output: - path.join(TMP_DIR,"unzipped" ,"{database}","{dataset}","{name}.pdbqt") + path.join(TMP_DIR, "unzipped", "{database}", "{dataset}", "{name}.pdbqt"), envmodules: - config["OPENBABEL"] + config["OPENBABEL"], shell: "obabel {input} -opdbqt -O {output}" + rule prepareReceptor: input: - path.join(TMP_DIR, "unzipped", "PDB", "receptor", "{name}.pdb") + path.join(TMP_DIR, "unzipped", "PDB", "receptor", "{name}.pdb"), output: - path.join(TMP_DIR ,"PDB", "receptor", "{name}.pdb") + path.join(TMP_DIR, "PDB", "receptor", "{name}.pdb"), envmodules: - config["BIOPYTHON"] + config["BIOPYTHON"], script: "../scripts/prepareReceptor.py" + rule makeReceptorPDBQT: input: - path.join(TMP_DIR, "PDB", "receptor", "{name}.pdb") + path.join(TMP_DIR, "PDB", "receptor", "{name}.pdb"), output: - path.join(PREPARED_DIR,"receptor", "{name}.pdbqt") + path.join(PREPARED_DIR, "receptor", "{name}.pdbqt"), envmodules: - config["OPENBABEL"] + config["OPENBABEL"], shell: "obabel -ipdb {input} -opdbqt -O {output} -xr" rule gunzip: input: - path.join(INPUT_DIR, "{database}","{dataset}","{name}.{filetype}.gz") + path.join(INPUT_DIR, "{database}", "{dataset}", "{name}.{filetype}.gz"), output: - path.join(TMP_DIR,"unzipped", "{database}","{dataset}","{name}.{filetype}") + path.join(TMP_DIR, "unzipped", "{database}", "{dataset}", "{name}.{filetype}"), shell: "gunzip < {input} > {output} || touch {output}" + rule cleanLigands: input: - path.join(TMP_DIR,"unzipped", "{database}","{dataset}","{name}.pdbqt") + path.join(TMP_DIR, "unzipped", "{database}", "{dataset}", "{name}.pdbqt"), output: - path.join(TMP_DIR, "cleaned", "{database}","{dataset}","{name}.pdbqt") + path.join(TMP_DIR, "cleaned", "{database}", "{dataset}", "{name}.pdbqt"), shell: "awk '/MODEL/ {{s=1}} s {{a[++c]=$0}} /Si/ {{p=1}} /ENDMDL/ {{if (!p) for (i=1;i<=c;i++) print a[i]; delete a;s=p=c=0;next}} !s' {input} > {output}" + checkpoint split: input: - 
path.join(TMP_DIR, "cleaned", "{database}","{dataset}","{name}.pdbqt") + path.join(TMP_DIR, "cleaned", "{database}", "{dataset}", "{name}.pdbqt"), output: - directory(os.path.join(TMP_DIR, "prepared", "{database}","{dataset}","{name}_dir")) + directory( + os.path.join(TMP_DIR, "prepared", "{database}", "{dataset}", "{name}_dir") + ), script: "../scripts/splitFile.py" + rule energyMin: input: - path.join(TMP_DIR, "prepared", "{database}","{dataset}","{name}_dir","{i}.pdbqt") + path.join( + TMP_DIR, "prepared", "{database}", "{dataset}", "{name}_dir", "{i}.pdbqt" + ), output: - path.join(MIN_DIR,"{database}","{dataset}","{name}","{i}.pdbqt") + path.join(MIN_DIR, "{database}", "{dataset}", "{name}", "{i}.pdbqt"), params: - algorithm = config["ENERGY_MIN_ALGORITHM"], - steps = config["STEPS"], - forcefield = config["FORCEFIELD"], - convergence = config["CONVERGENCE_CRITERIA"] + algorithm=config["ENERGY_MIN_ALGORITHM"], + steps=config["STEPS"], + forcefield=config["FORCEFIELD"], + convergence=config["CONVERGENCE_CRITERIA"], threads: config["ENERGY_MIN"]["threads"] resources: - partition = config["ENERGY_MIN"]["partition"], - runtime = config["ENERGY_MIN"]["runtime"], - mem_mb = config["ENERGY_MIN"]["mem_mb"], + partition=config["ENERGY_MIN"]["partition"], + runtime=config["ENERGY_MIN"]["runtime"], + mem_mb=config["ENERGY_MIN"]["mem_mb"], envmodules: - config["OPENBABEL"] + config["OPENBABEL"], shell: "obabel -ipdbqt {input} -opdbqt -O {output} --minimize --{params.algorithm} --steps {params.steps} --ff {params.forcefield} --crit {params.convergence}" + rule prepareGeometry: input: - path.join(config["GRID_DIR"],"{receptorID}.gpf") + path.join(config["GRID_DIR"], "{receptorID}.gpf"), output: - path.join(OUTPUT_DIR,"grid","{receptorID}_grid.txt") + path.join(OUTPUT_DIR, "grid", "{receptorID}_grid.txt"), shell: "egrep 'npts|gridcenter' {input} |cut -f2-4 -d' '| tac |tr '\n' ' ' > {output} && sed -i -e '$a\ ' {output}" + rule prepareLibrary: input: - path.join(MIN_DIR,"{database}","{dataset}","{name}","{i}.pdbqt") + path.join(MIN_DIR, "{database}", "{dataset}", "{name}", "{i}.pdbqt"), output: - library=path.join(OUTPUT_DIR,"library","{database}_{dataset}_{name}_{i}.txt") + library=path.join(OUTPUT_DIR, "library", "{database}_{dataset}_{name}_{i}.txt"), shell: "echo {input} > {output}" + rule prepareDocking: input: - path.join(PREPARED_DIR, "receptor", "{receptorID}.pdbqt") + path.join(PREPARED_DIR, "receptor", "{receptorID}.pdbqt"), output: - path.join(OUTPUT_DIR,"receptor","{receptorID}.txt") + path.join(OUTPUT_DIR, "receptor", "{receptorID}.txt"), shell: "echo {input} > {output}"