From f85ac427038ea18678d564ae27bf9f5c9017bd19 Mon Sep 17 00:00:00 2001
From: meesters
Date: Wed, 6 Aug 2025 13:54:10 +0200
Subject: [PATCH 1/3] refactor: all scripts

---
 workflow/scripts/ResultTxt.py           |  23 +++--
 workflow/scripts/ZINCdownload.py        |   1 +
 workflow/scripts/concatPDBQT.py         |   1 +
 workflow/scripts/downloadZINCsubsets.py |   6 +-
 workflow/scripts/generateIRODS.py       |   6 +-
 workflow/scripts/makeHistogram.py       |  13 ++-
 workflow/scripts/mergeOutput.py         |  15 +--
 workflow/scripts/prepareDocking.py      |  11 ++-
 workflow/scripts/prepareReceptor.py     |  14 +--
 workflow/scripts/sortResult.py          |  66 +++++++------
 workflow/scripts/splitFile.py           |  10 +-
 workflow/scripts/union_venn.py          |  73 ++++++++------
 workflow/scripts/venn.py                | 124 +++++++++++++-----------
 13 files changed, 206 insertions(+), 157 deletions(-)

diff --git a/workflow/scripts/ResultTxt.py b/workflow/scripts/ResultTxt.py
index 2d30e7b..981b168 100644
--- a/workflow/scripts/ResultTxt.py
+++ b/workflow/scripts/ResultTxt.py
@@ -5,11 +5,12 @@
 import os
 import pandas as pd
 
-infile = snakemake.input[0] 
+infile = snakemake.input[0]
 outfile = snakemake.output[0]
 
 linesep = os.linesep
 
+
 def pairwise(iterable):
     """
     :param iterable:
@@ -19,28 +20,29 @@ def pairwise(iterable):
     next(b, None)
     return zip(a, b)
 
+
 def extract_pdbqt(infile):
     """
     :param infile: path to infile
     :param outfile: path to outfile
     :return: receptor name, IDs and enthalpies
     """
-    target_IDs = []
-    enthalpies = []
-    references = set()
-    with open(infile, encoding='utf-8') as to_parse:
+    target_IDs = []
+    enthalpies = []
+    references = set()
+    with open(infile, encoding="utf-8") as to_parse:
         # this line reads: 'REMARK RECEPTOR path/to/target.pdbqt'
         headerline = to_parse.readline()
         # we want to extract the target name WITHOUT the suffix
-        receptor = Path(headerline.split(' ')[-1]).stem
+        receptor = Path(headerline.split(" ")[-1]).stem
         # we then proceed extracting the enthalpy per ligand
         # we iterate two lines per iteration, because the file is structured like:
         #
         # REMARK VINA RESULT:    Enthalpy 1    Enthalpy 2    Enthalpy 3
         # REMARK  Name = 
         for i, j in pairwise(to_parse):
-            if 'Name' in j:
-                ID = j.split(' = ')[-1].strip(linesep)
+            if "Name" in j:
+                ID = j.split(" = ")[-1].strip(linesep)
                 if ID not in references:
                     references.add(ID)
                     target_IDs.append(ID)
@@ -48,9 +50,10 @@ def extract_pdbqt(infile):
 
     return receptor, target_IDs, enthalpies
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     receptor, target_IDs, enthalpies = extract_pdbqt(infile)
     # putting into data frame:
-    out_df = pd.DataFrame(enthalpies, index = target_IDs, columns = [receptor])
+    out_df = pd.DataFrame(enthalpies, index=target_IDs, columns=[receptor])
     # finally, save the output
     out_df.to_csv(outfile)
diff --git a/workflow/scripts/ZINCdownload.py b/workflow/scripts/ZINCdownload.py
index 4410ca0..455e420 100644
--- a/workflow/scripts/ZINCdownload.py
+++ b/workflow/scripts/ZINCdownload.py
@@ -1,4 +1,5 @@
 """download ligands in pdbqt format from ZINC database"""
+
 import os
 
 import requests
diff --git a/workflow/scripts/concatPDBQT.py b/workflow/scripts/concatPDBQT.py
index f7286de..6d32adc 100644
--- a/workflow/scripts/concatPDBQT.py
+++ b/workflow/scripts/concatPDBQT.py
@@ -1,4 +1,5 @@
 """read all files in input folder and concat them together"""
+
 import os
 
 from snakemake.shell import shell
diff --git a/workflow/scripts/downloadZINCsubsets.py b/workflow/scripts/downloadZINCsubsets.py
index 2b650ba..ac8e872 100644
--- a/workflow/scripts/downloadZINCsubsets.py
+++ b/workflow/scripts/downloadZINCsubsets.py
@@ -1,11 +1,13 @@
-'''download ligand subsets in mol2 format from ZINC database'''
+"""download ligand subsets in mol2 format from ZINC database"""
 
 import requests
 
 out = snakemake.output[0]
 subset = snakemake.params.sub
 
-url = ''.join(["https://zinc15.docking.org/substances/subsets/", subset, ".mol2?count=all"])
+url = "".join(
+    ["https://zinc15.docking.org/substances/subsets/", subset, ".mol2?count=all"]
+)
 
 r = requests.get(url, timeout=60)
 with open(out, "wb") as outfile:
diff --git a/workflow/scripts/generateIRODS.py b/workflow/scripts/generateIRODS.py
index 4161460..cbfc3ca 100755
--- a/workflow/scripts/generateIRODS.py
+++ b/workflow/scripts/generateIRODS.py
@@ -3,7 +3,7 @@
 import json
 
 import pandas as pd
 
-target1 = snakemake.config["TARGETS"] 
+target1 = snakemake.config["TARGETS"]
 targets = snakemake.config["RESCREENING_TARGETS"]
 name = snakemake.config["EXPERIMENT_NAME"]
 
@@ -37,5 +37,5 @@
 
 out_dict.update(results)
 
-with open(outfile, "w", encoding = "utf-8") as f:
-    json.dump(out_dict, f, sort_keys = False, indent = 4)
+with open(outfile, "w", encoding="utf-8") as f:
+    json.dump(out_dict, f, sort_keys=False, indent=4)
diff --git a/workflow/scripts/makeHistogram.py b/workflow/scripts/makeHistogram.py
index c8ef6ac..5ed5ffc 100644
--- a/workflow/scripts/makeHistogram.py
+++ b/workflow/scripts/makeHistogram.py
@@ -5,12 +5,14 @@
 import pandas as pd
 from matplotlib import pyplot as plt
 
+
 def pairwise(iterable):
     "s -> (s0,s1), (s1,s2), (s2, s3), ..."
-    a, b= itertools.tee(iterable)
+    a, b = itertools.tee(iterable)
     next(b, None)
     return zip(a, b)
 
+
 def getHistogram(in_pdbqt, low, high, step):
     """creates histogram of structures in_pdbqt.
 
@@ -25,15 +27,15 @@ def getHistogram(in_pdbqt, low, high, step):
     step : float
         size of binding energy buckets.
     """
-    f = gzip.open(in_pdbqt, 'r')
+    f = gzip.open(in_pdbqt, "r")
     lowerBound = low
     bins = []
-    list1 =[]
+    list1 = []
     while lowerBound < high:
         bins.append(lowerBound)
         lowerBound = lowerBound + step
     bins = [round(num, 2) for num in bins]
-    for a,b in pairwise(f):
+    for a, b in pairwise(f):
         if "REMARK VINA RESULT:" in str(b):
             if "MODEL 1" in str(a):
                 tmp = b.split()
@@ -46,6 +48,7 @@ def getHistogram(in_pdbqt, low, high, step):
     plt.ylabel("frequency")
     fig = plt.gcf()
     fig.set_size_inches(10, 5)
-    plt.savefig(snakemake.output[0], dpi = 300)
+    plt.savefig(snakemake.output[0], dpi=300)
+
 
 getHistogram(snakemake.input[0], -13, -3, 0.2)
diff --git a/workflow/scripts/mergeOutput.py b/workflow/scripts/mergeOutput.py
index be0d6ec..afdab83 100644
--- a/workflow/scripts/mergeOutput.py
+++ b/workflow/scripts/mergeOutput.py
@@ -1,18 +1,21 @@
 """merge all docking output files together"""
+
 import shutil
 import subprocess
 
 outFile = snakemake.output[0]
 inFiles = snakemake.input
 
-#TODO: change to ZIPFILE to support better result portability
-with open(outFile, 'wb') as outF:
+# TODO: change to ZIPFILE to support better result portability
+with open(outFile, "wb") as outF:
     for file in inFiles:
         try:
-            out = subprocess.check_output(['gunzip', '-t', file]) #test if gzip file is valid before merging
+            out = subprocess.check_output(
+                ["gunzip", "-t", file]
+            )  # test if gzip file is valid before merging
 
-            #TODO: avoid overwriting the file descriptor!
-            with open(file, 'rb') as sourceFile:
+            # TODO: avoid overwriting the file descriptor!
+            with open(file, "rb") as sourceFile:
                 shutil.copyfileobj(sourceFile, outF)
         except subprocess.CalledProcessError:
-            print(f'Erros occured during the docking of {file}')
+            print(f"Errors occurred during the docking of {file}")
diff --git a/workflow/scripts/prepareDocking.py b/workflow/scripts/prepareDocking.py
index d80f4dd..80ed987 100644
--- a/workflow/scripts/prepareDocking.py
+++ b/workflow/scripts/prepareDocking.py
@@ -2,6 +2,7 @@
 preparation of ligand input file for VinaLC
 containing the path to every ligand with same weight + logP
 """
+
 import os
 
 input_directory = snakemake.input.in_dir
@@ -15,7 +16,13 @@
             for purchase in snakemake.config["ZINC_INPUT"]["PURCHASE"]:
                 for ph in snakemake.config["ZINC_INPUT"]["PH"]:
                     for charge in snakemake.config["ZINC_INPUT"]["CHARGE"]:
-                        fname = weightlog + react + purchase + ph + charge + ".pdbqt"
+                        fname = (
+                            weightlog + react + purchase + ph + charge + ".pdbqt"
+                        )
                         ligand_file = os.path.join(input_directory, fname)
-                        with open(os.path.join(snakemake.output.library ),"r", encoding='utf-8') as file_object:
+                        with open(
+                            os.path.join(snakemake.output.library),
+                            "a",  # append mode; the file is written below, "r" cannot be written to
+                            encoding="utf-8",
+                        ) as file_object:
                             file_object.write(ligand_file + "\n")
diff --git a/workflow/scripts/prepareReceptor.py b/workflow/scripts/prepareReceptor.py
index 31bb4e0..239b81c 100644
--- a/workflow/scripts/prepareReceptor.py
+++ b/workflow/scripts/prepareReceptor.py
@@ -3,6 +3,7 @@
 import os
 from Bio.PDB import PDBParser, PDBIO
 
+
 def removeChains(model, chainlist):
     """
     removes chains not specified in chainlist from model
@@ -20,7 +21,7 @@
     chain_to_remove = []
     for chain in model:
         for residue in chain:
-            if residue.id[0] != ' ':
+            if residue.id[0] != " ":
                 residue_to_remove.append((chain.id, residue.id))
         if not chain:
             chain_to_remove.append(chain.id)
@@ -39,9 +40,9 @@
     select chains to delete depending on config definition
     """
     print(target)
-    ID = target.split(',')
+    ID = target.split(",")
     chains = ID[1].split(" ")
-    parser = PDBParser()#MMCIFParser()
+    parser = PDBParser()  # MMCIFParser()
     structure = parser.get_structure(ID[0], inputfile)
     model = structure[0]
     removeChains(model, chains)
@@ -52,12 +53,13 @@
     print("printing outfile")
     io.save(out)
 
-head,tail = os.path.split(snakemake.input[0])
+
+head, tail = os.path.split(snakemake.input[0])
 filename = tail.split(".")[0]
 
 if any(filename in target for target in snakemake.config["TARGETS"]):
-    print('filename in targets')
-    prepareRec(snakemake.input[0], snakemake.output[0], snakemake.config["TARGETS"][0])
+    print("filename in targets")
+    prepareRec(snakemake.input[0], snakemake.output[0], snakemake.config["TARGETS"][0])
 
 elif filename in str(snakemake.config["RESCREENING_TARGETS"]):
     for target in snakemake.config["RESCREENING_TARGETS"]:
diff --git a/workflow/scripts/sortResult.py b/workflow/scripts/sortResult.py
index bd6afe2..6506e16 100644
--- a/workflow/scripts/sortResult.py
+++ b/workflow/scripts/sortResult.py
@@ -1,19 +1,21 @@
 """sorting and outputting the N>1 best results from a docking run"""
+
 import subprocess
 import gzip
 import os
 from math import ceil
 
+
 inPath = snakemake.input[0]
 outFile = snakemake.output[0]
 num = float(snakemake.config["RESULT_NUMBER"])
 
-count=0
-lineN=0
-ids=0
-swapFlg=False
-scoreList=[]
-strucList=[]
-tempList=[]
+count = 0
+lineN = 0
+ids = 0
+swapFlg = False
+scoreList = []
+strucList = []
+tempList = []
 
 getLigNum = "zgrep -c 'REMARK RECEPTOR' " + inPath
 ligand_num =
int(subprocess.check_output(getLigNum, shell=True)) @@ -21,42 +23,42 @@ if num > 1: listSize = num else: - total=float(ligand_num) - listSize = ceil(num*total) + total = float(ligand_num) + listSize = ceil(num * total) -with gzip.open(inPath, 'rt', encoding='utf-8') as inFile: +with gzip.open(inPath, "rt", encoding="utf-8") as inFile: for line in inFile: if "REMARK RECEPTOR" in line: - lineN=0 - if count>0: - if count<=listSize: + lineN = 0 + if count > 0: + if count <= listSize: strucList.append(tempList) elif swapFlg: - strucList[ids]=tempList - swapFlg=False - count=count+1 - tempList=[] - if lineN==3: - strs=line.split() - curValue=float(strs[3]) - if count<=listSize: + strucList[ids] = tempList + swapFlg = False + count = count + 1 + tempList = [] + if lineN == 3: + strs = line.split() + curValue = float(strs[3]) + if count <= listSize: scoreList.append(curValue) else: - maxValue=max(scoreList) - if curValue (s0,s1), (s1,s2), (s2, s3), ..." - a, b= itertools.tee(iterable) + a, b = itertools.tee(iterable) next(b, None) return zip(a, b) + def group3(iterable): - '''iterates over 3 lines at once''' + """iterates over 3 lines at once""" a, b, c = itertools.tee(iterable, 3) next(b, None) next(c, None) next(c, None) return zip(a, b, c) + def extract_pdbqt(infile): """ :param infile: path to infile :param outfile: path to outfile :return: receptor name, IDs and enthalpies """ - target_IDs = [] - enthalpies = [] - references = set() - with open(infile, encoding='utf-8') as to_parse: + target_IDs = [] + enthalpies = [] + references = set() + with open(infile, encoding="utf-8") as to_parse: # this line reads: 'REMARK RECEPTOR path/to/target.pdbqt' headerline = to_parse.readline() # we want to extract the target name WITHOUT the suffix - receptor = Path(headerline.split(' ')[-1]).stem + receptor = Path(headerline.split(" ")[-1]).stem # we then proceed extracting the enthalpy per ligand # we iterate two line per iteration, because the file is structured, like: # # REMARK VINA RESULT: Enthalpy 1 Enthalpy 2 Enthalpy 3 # REMARK Name = for i, j in pairwise(to_parse): - if 'Name' in j: - ID = j.split(' = ')[-1].strip(linesep) + if "Name" in j: + ID = j.split(" = ")[-1].strip(linesep) if ID not in references: references.add(ID) target_IDs.append(ID) @@ -53,28 +56,29 @@ def extract_pdbqt(infile): return receptor, target_IDs, enthalpies + def extract_pdbqt_rescreening(infile): """ :param infile: path to infile :param outfile: path to outfile :return: receptor name, IDs and enthalpies """ - target_IDs = [] - enthalpies = [] - references = set() - with open(infile, encoding='utf-8') as to_parse: + target_IDs = [] + enthalpies = [] + references = set() + with open(infile, encoding="utf-8") as to_parse: # this line reads: 'REMARK RECEPTOR path/to/target.pdbqt' headerline = to_parse.readline() # we want to extract the target name WITHOUT the suffix - receptor = Path(headerline.split(' ')[-1]).stem + receptor = Path(headerline.split(" ")[-1]).stem # we then proceed extracting the enthalpy per ligand # we iterate two line per iteration, because the file is structured, like: # # REMARK VINA RESULT: Enthalpy 1 Enthalpy 2 Enthalpy 3 # REMARK Name = for a, _, j in group3(to_parse): - if 'Name' in j: - ID = j.split(' = ')[-1].strip(linesep) + if "Name" in j: + ID = j.split(" = ")[-1].strip(linesep) if ID not in references: references.add(ID) target_IDs.append(ID) @@ -82,50 +86,55 @@ def extract_pdbqt_rescreening(infile): return receptor, target_IDs, enthalpies + def unionVenn(bestLigandFile, outFile, reBest): - 
'''outputs union table of ''' + """outputs union table of""" ID = [] value = [] receptor, ID, value = extract_pdbqt(bestLigandFile) - out_df = pd.DataFrame(value, index = ID, columns = [receptor]) + out_df = pd.DataFrame(value, index=ID, columns=[receptor]) inputArray = [] for inFile in reBest: inputArray.append(inFile) - for i in inputArray: #join all rescreening results to first screening + for i in inputArray: # join all rescreening results to first screening receptor, ID, value = extract_pdbqt_rescreening(i) - df = pd.DataFrame(value, index = ID, columns = [receptor]) - out_df = out_df.join(df, how = 'inner') + df = pd.DataFrame(value, index=ID, columns=[receptor]) + out_df = out_df.join(df, how="inner") out_df.to_csv(outFile) + def getLigands(result_file): - """ take results file and outputs ligand names and receptor name """ + """take results file and outputs ligand names and receptor name""" ligandNames = [] - with open(result_file, encoding = 'utf-8') as result: + with open(result_file, encoding="utf-8") as result: for line in result: - if 'REMARK RECEPTOR' in line: - receptor = line.split('/')[-1] - if 'Name' in line: + if "REMARK RECEPTOR" in line: + receptor = line.split("/")[-1] + if "Name" in line: ligandNames.append(line.split()[-1]) return ligandNames, receptor + + def makeVenn(result_list): """takes rescreening results and outputs a venn diagram""" ligand_lists = [] receptor_list = [] for result in result_list: - ligands,receptor = getLigands(result) + ligands, receptor = getLigands(result) ligand_lists.append(ligands) receptor_list.append(receptor) - venn.venn(ligand_lists, snakemake.output[1], receptor_list, figsize=(16,16)) + venn.venn(ligand_lists, snakemake.output[1], receptor_list, figsize=(16, 16)) + -#rescreen_results = str(rescreen_results).split() +# rescreen_results = str(rescreen_results).split() if 1 < len(rescreen_results) <= 4: makeVenn(rescreen_results) else: - open(snakemake.output[1], 'a', encoding = 'utf-8').close() -unionVenn( snakemake.input[0], snakemake.output[0], rescreen_results) + open(snakemake.output[1], "a", encoding="utf-8").close() +unionVenn(snakemake.input[0], snakemake.output[0], rescreen_results) diff --git a/workflow/scripts/venn.py b/workflow/scripts/venn.py index 58abe8f..aa50ca3 100644 --- a/workflow/scripts/venn.py +++ b/workflow/scripts/venn.py @@ -1,16 +1,17 @@ """ creating venn diagrams""" - from itertools import chain from collections.abc import Iterable from matplotlib.patches import Circle, Ellipse import pylab + # -------------------------------------------------------------------- -alignment = {'horizontalalignment': 'center', 'verticalalignment': 'baseline'} +alignment = {"horizontalalignment": "center", "verticalalignment": "baseline"} # ------------------------------------------------------------------- + def venn(data, outfile, names=None, fill="number", **kwds): """ data: a list @@ -23,11 +24,11 @@ def venn(data, outfile, names=None, fill="number", **kwds): if data is None: raise Exception("No data!") if len(data) == 2: - venn2(data, outfile, names, fill,**kwds) + venn2(data, outfile, names, fill, **kwds) elif len(data) == 3: - venn3(data, outfile, names, fill,**kwds) + venn3(data, outfile, names, fill, **kwds) elif len(data) == 4: - venn4(data, outfile, names, fill,**kwds) + venn4(data, outfile, names, fill, **kwds) else: raise Exception("currently only 2-4 sets venn diagrams are supported") @@ -56,15 +57,15 @@ def get_labels(data, fill="number"): N = len(data) sets_data = [set(data[i]) for i in range(N)] # sets for 
separate groups - s_all = set(chain(*data)) # union of all sets + s_all = set(chain(*data)) # union of all sets # bin(3) --> '0b11', so bin(3).split('0b')[-1] will remove "0b" set_collections = {} for n in range(1, 2**N): - key = bin(n).split('0b')[-1].zfill(N) + key = bin(n).split("0b")[-1].zfill(N) value = s_all - sets_for_intersection = [sets_data[i] for i in range(N) if key[i] == '1'] - sets_for_difference = [sets_data[i] for i in range(N) if key[i] == '0'] + sets_for_intersection = [sets_data[i] for i in range(N) if key[i] == "1"] + sets_for_difference = [sets_data[i] for i in range(N) if key[i] == "0"] for s in sets_for_intersection: value = value & s for s in sets_for_difference: @@ -80,6 +81,7 @@ def get_labels(data, fill="number"): return labels + # -------------------------------------------------------------------- @@ -94,9 +96,9 @@ def venn2(data, outfile, names=None, fill="number", **kwds): labels = get_labels(data, fill=fill) # set figure size - if 'figsize' in kwds and len(kwds['figsize']) == 2: + if "figsize" in kwds and len(kwds["figsize"]) == 2: # if 'figsize' is in kwds, and it is a list or tuple with length of 2 - figsize = kwds['figsize'] + figsize = kwds["figsize"] else: # default figure size figsize = (8, 8) @@ -109,10 +111,14 @@ def venn2(data, outfile, names=None, fill="number", **kwds): ax.set_ylim(0, 8) # set colors for different Circles or ellipses - if 'colors' in kwds and isinstance(kwds['colors'], Iterable) and len(kwds['colors']) >= 2: - colors = kwds['colors'] + if ( + "colors" in kwds + and isinstance(kwds["colors"], Iterable) + and len(kwds["colors"]) >= 2 + ): + colors = kwds["colors"] else: - colors = ['red', 'green'] + colors = ["red", "green"] c1 = Circle((3.0, 4.0), radius=2.0, alpha=0.5, color=colors[0]) c2 = Circle((5.0, 4.0), radius=2.0, alpha=0.5, color=colors[1]) @@ -122,19 +128,21 @@ def venn2(data, outfile, names=None, fill="number", **kwds): # draw text # 1 - pylab.text(2.0, 4.0, labels['10'], **alignment) - pylab.text(2.5, 4.0, labels['01'], **alignment) + pylab.text(2.0, 4.0, labels["10"], **alignment) + pylab.text(2.5, 4.0, labels["01"], **alignment) # 2 - pylab.text(4.0, 4.0, labels['11'], **alignment) + pylab.text(4.0, 4.0, labels["11"], **alignment) # names of different groups if outfile: - pylab.text(3.0, 4.0-1.2*2.0, names[0], fontsize=16, **alignment) - pylab.text(5.0, 4.0-1.2*2.0, names[1], fontsize=16, **alignment) + pylab.text(3.0, 4.0 - 1.2 * 2.0, names[0], fontsize=16, **alignment) + pylab.text(5.0, 4.0 - 1.2 * 2.0, names[1], fontsize=16, **alignment) - leg = ax.legend(names, loc='best', fancybox=True) + leg = ax.legend(names, loc="best", fancybox=True) leg.get_frame().set_alpha(0.5) pylab.savefig(outfile) + + # -------------------------------------------------------------------- @@ -148,15 +156,15 @@ def venn3(data, outfile, names=None, fill="number", **kwds): labels = get_labels(data, fill=fill) # set figure size - if 'figsize' in kwds and len(kwds['figsize']) == 2: + if "figsize" in kwds and len(kwds["figsize"]) == 2: # if 'figsize' is in kwds, and it is a list or tuple with length of 2 - figsize = kwds['figsize'] + figsize = kwds["figsize"] else: # default figure size figsize = (10, 10) - fig = pylab.figure(figsize=figsize) # set figure size + fig = pylab.figure(figsize=figsize) # set figure size ax = fig.gca() - ax.set_aspect("equal") # set aspect ratio to 1 + ax.set_aspect("equal") # set aspect ratio to 1 ax.set_xticks([]) ax.set_yticks([]) ax.set_xlim(0, 8) @@ -164,7 +172,7 @@ def venn3(data, outfile, names=None, 
fill="number", **kwds): # set colors for different Circles or ellipses - colors = ['red', 'green', 'blue'] + colors = ["red", "green", "blue"] c1 = Circle((3.0, 3.0), radius=2.0, alpha=0.5, color=colors[0]) c2 = Circle((5.0, 3.0), radius=2.0, alpha=0.5, color=colors[1]) @@ -174,22 +182,22 @@ def venn3(data, outfile, names=None, fill="number", **kwds): # draw text # 1 - pylab.text(3.0-2/2, 3.0-2/2, labels['100'], **alignment) - pylab.text(5.0+2/2, 3.0-2/2, labels['010'], **alignment) - pylab.text((3.0+5.0)/2, 4.73+2/2, labels['001'], **alignment) + pylab.text(3.0 - 2 / 2, 3.0 - 2 / 2, labels["100"], **alignment) + pylab.text(5.0 + 2 / 2, 3.0 - 2 / 2, labels["010"], **alignment) + pylab.text((3.0 + 5.0) / 2, 4.73 + 2 / 2, labels["001"], **alignment) # 2 - pylab.text((3.0+5.0)/2, 3.0-2/2, labels['110'], **alignment) - pylab.text(3.0, 3.0+2*2/3, labels['101'], **alignment) - pylab.text(5.0, 3.0+2*2/3, labels['011'], **alignment) + pylab.text((3.0 + 5.0) / 2, 3.0 - 2 / 2, labels["110"], **alignment) + pylab.text(3.0, 3.0 + 2 * 2 / 3, labels["101"], **alignment) + pylab.text(5.0, 3.0 + 2 * 2 / 3, labels["011"], **alignment) # 3 - pylab.text((3.0+5.0)/2, 3.0+2/3, labels['111'], **alignment) + pylab.text((3.0 + 5.0) / 2, 3.0 + 2 / 3, labels["111"], **alignment) # names of different groups if outfile: pylab.text(1.0, 1.0, names[0], fontsize=16, **alignment) pylab.text(7.0, 1.0, names[1], fontsize=16, **alignment) - pylab.text(4.0, 4.73+1.2*2, names[2], fontsize=16, **alignment) + pylab.text(4.0, 4.73 + 1.2 * 2, names[2], fontsize=16, **alignment) - leg = ax.legend(names, loc='best', fancybox=True) + leg = ax.legend(names, loc="best", fancybox=True) leg.get_frame().set_alpha(0.5) pylab.savefig(outfile) @@ -203,20 +211,24 @@ def venn4(data, outfile, names=None, fill="number", **kwds): labels = get_labels(data, fill=fill) # set figure size - if 'figsize' in kwds and len(kwds['figsize']) == 2: + if "figsize" in kwds and len(kwds["figsize"]) == 2: # if 'figsize' is in kwds, and it is a list or tuple with length of 2 - figsize = kwds['figsize'] + figsize = kwds["figsize"] else: # default figure size figsize = (10, 10) # set colors for different Circles or ellipses - if 'colors' in kwds and isinstance(kwds['colors'], Iterable) and len(kwds['colors']) >= 4: - colors = kwds['colors'] + if ( + "colors" in kwds + and isinstance(kwds["colors"], Iterable) + and len(kwds["colors"]) >= 4 + ): + colors = kwds["colors"] else: - colors = ['r', 'g', 'b', 'c'] + colors = ["r", "g", "b", "c"] # draw ellipse, the coordinates are hard coded in the rest of the function - fig = pylab.figure(figsize=figsize) # set figure size + fig = pylab.figure(figsize=figsize) # set figure size ax = fig.gca() patches = [] width, height = 170, 110 # width and height of the ellipses @@ -234,24 +246,24 @@ def venn4(data, outfile, names=None, fill="number", **kwds): # draw text # 1 - pylab.text(120, 200, labels['1000'], **alignment) - pylab.text(280, 200, labels['0100'], **alignment) - pylab.text(155, 250, labels['0010'], **alignment) - pylab.text(245, 250, labels['0001'], **alignment) + pylab.text(120, 200, labels["1000"], **alignment) + pylab.text(280, 200, labels["0100"], **alignment) + pylab.text(155, 250, labels["0010"], **alignment) + pylab.text(245, 250, labels["0001"], **alignment) # 2 - pylab.text(200, 115, labels['1100'], **alignment) - pylab.text(140, 225, labels['1010'], **alignment) - pylab.text(145, 155, labels['1001'], **alignment) - pylab.text(255, 155, labels['0110'], **alignment) - pylab.text(260, 225, labels['0101'], 
**alignment) - pylab.text(200, 240, labels['0011'], **alignment) + pylab.text(200, 115, labels["1100"], **alignment) + pylab.text(140, 225, labels["1010"], **alignment) + pylab.text(145, 155, labels["1001"], **alignment) + pylab.text(255, 155, labels["0110"], **alignment) + pylab.text(260, 225, labels["0101"], **alignment) + pylab.text(200, 240, labels["0011"], **alignment) # 3 - pylab.text(235, 205, labels['0111'], **alignment) - pylab.text(165, 205, labels['1011'], **alignment) - pylab.text(225, 135, labels['1101'], **alignment) - pylab.text(175, 135, labels['1110'], **alignment) + pylab.text(235, 205, labels["0111"], **alignment) + pylab.text(165, 205, labels["1011"], **alignment) + pylab.text(225, 135, labels["1101"], **alignment) + pylab.text(175, 135, labels["1110"], **alignment) # 4 - pylab.text(200, 175, labels['1111'], **alignment) + pylab.text(200, 175, labels["1111"], **alignment) # names of different groups if outfile: pylab.text(110, 110, names[0], fontsize=16, **alignment) @@ -259,7 +271,7 @@ def venn4(data, outfile, names=None, fill="number", **kwds): pylab.text(130, 275, names[2], fontsize=16, **alignment) pylab.text(270, 275, names[3], fontsize=16, **alignment) - leg = ax.legend(names, loc='best', fancybox=True) + leg = ax.legend(names, loc="best", fancybox=True) leg.get_frame().set_alpha(0.5) pylab.savefig(outfile) From 043abb5584981e10b71cd7ea116f41b10752eeb3 Mon Sep 17 00:00:00 2001 From: meesters Date: Wed, 6 Aug 2025 13:55:03 +0200 Subject: [PATCH 2/3] refactor: Snakefile --- workflow/Snakefile | 141 ++++++++++++++++++++++++++++++++------------- 1 file changed, 102 insertions(+), 39 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index ecdaa75..4287547 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -3,19 +3,19 @@ from os import path from snakemake.utils import min_version -min_version("7.19.1") # this is where SLURM support was introduced +min_version("7.19.1") # this is where SLURM support was introduced -INPUT_DIR=config["INPUT_DIR"] +INPUT_DIR = config["INPUT_DIR"] -MIN_DIR=config["PREPARED_LIGAND_DIR"] +MIN_DIR = config["PREPARED_LIGAND_DIR"] -PREPARED_DIR=config["PREPARED_DATA_DIR"] +PREPARED_DIR = config["PREPARED_DATA_DIR"] -OUTPUT_DIR=config["OUTPUT_DIR"] +OUTPUT_DIR = config["OUTPUT_DIR"] -TMP_DIR=config["TEMP_DATA_DIR"] +TMP_DIR = config["TEMP_DATA_DIR"] -LOCAL_INPUT =config["LOCAL_INPUT_DIR"] +LOCAL_INPUT = config["LOCAL_INPUT_DIR"] DATABASE = config["DATABASE"] @@ -23,77 +23,140 @@ SUBSET = config["SUBSET"] RESCREENING_TARGETS = config["RESCREENING_TARGETS"] + def generateOutput(wildcards): irods = path.join(OUTPUT_DIR, "results", "irods.json") if config["RESCREENING"] == "TRUE": - out = expand(path.join(OUTPUT_DIR, "results", "rescreening_{percentage}", "{receptorID}", "union.csv"), - receptorID = config["TARGETS"][0].split(',')[0], - percentage = config["RESULT_NUMBER"], - combAll = combAll) - hist = expand(path.join(OUTPUT_DIR, "results", "{receptorID}_hist.png"), - receptorID = config["TARGETS"][0].split(',')[0]) + out = expand( + path.join( + OUTPUT_DIR, + "results", + "rescreening_{percentage}", + "{receptorID}", + "union.csv", + ), + receptorID=config["TARGETS"][0].split(",")[0], + percentage=config["RESULT_NUMBER"], + combAll=combAll, + ) + hist = expand( + path.join(OUTPUT_DIR, "results", "{receptorID}_hist.png"), + receptorID=config["TARGETS"][0].split(",")[0], + ) - return hist + out + [irods] + return hist + out + [irods] else: - out = expand(path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.csv"), - 
receptorID = config["TARGETS"][0].split(',')[0], - percentage = config["RESULT_NUMBER"]) - hist = expand(path.join(OUTPUT_DIR, "results", "{receptorID}_hist.png"), - receptorID = config["TARGETS"][0].split(',')[0]) + out = expand( + path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.csv"), + receptorID=config["TARGETS"][0].split(",")[0], + percentage=config["RESULT_NUMBER"], + ) + hist = expand( + path.join(OUTPUT_DIR, "results", "{receptorID}_hist.png"), + receptorID=config["TARGETS"][0].split(",")[0], + ) return hist + out + [irods] -localrules: all, generateIRODS, dockingResultsTxt, removeDuplicateLigands, makeVenn, prepareLigands2, mergeDocking2, bestLigands, prepareSecondDocking, convertMol2, makeReceptorPDBQT, mergeDocking ,mergeLocalInput, split, split2, targetProtein, getZINCdata, getZINCSubsets, gunzip, ENAMINEdownload, prepareReceptor, prepareDocking, prepareLibrary, prepareGeometry, makeHistogram, cleanLigands - -targetList=[] #get ProteinIDs from configfile for rescreening +localrules: + all, + generateIRODS, + dockingResultsTxt, + removeDuplicateLigands, + makeVenn, + prepareLigands2, + mergeDocking2, + bestLigands, + prepareSecondDocking, + convertMol2, + makeReceptorPDBQT, + mergeDocking, + mergeLocalInput, + split, + split2, + targetProtein, + getZINCdata, + getZINCSubsets, + gunzip, + ENAMINEdownload, + prepareReceptor, + prepareDocking, + prepareLibrary, + prepareGeometry, + makeHistogram, + cleanLigands, + + +targetList = [] # get ProteinIDs from configfile for rescreening for i in config["RESCREENING_TARGETS"]: - targetList.append(i.split(',')[0]) + targetList.append(i.split(",")[0]) combList = [] for comb in itertools.combinations(targetList, 2): # all combinations of two targets - combList.append('_'.join(comb)) + combList.append("_".join(comb)) + +combAll = "_".join(targetList) # combine all rescreening targets -combAll = '_'.join(targetList) # combine all rescreening targets def getAllVenn(wildcards): - path.join(OUTPUT_DIR, "output", "rescreening", "{receptorID}", "{RESCREENING_TARGETS}_union.txt") + path.join( + OUTPUT_DIR, + "output", + "rescreening", + "{receptorID}", + "{RESCREENING_TARGETS}_union.txt", + ) def IRODSinput(wildcards): if config["RESCREENING"] == "TRUE": - out = expand(path.join(OUTPUT_DIR, "results", "rescreening_{percentage}", "{receptorID}", "union.csv"), - receptorID = config["TARGETS"][0].split(',')[0], - percentage = config["RESULT_NUMBER"], - combAll = combAll) + out = expand( + path.join( + OUTPUT_DIR, + "results", + "rescreening_{percentage}", + "{receptorID}", + "union.csv", + ), + receptorID=config["TARGETS"][0].split(",")[0], + percentage=config["RESULT_NUMBER"], + combAll=combAll, + ) else: - out = expand(path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.csv"), - receptorID = config["TARGETS"][0].split(',')[0], - percentage = config["RESULT_NUMBER"]) + out = expand( + path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.csv"), + receptorID=config["TARGETS"][0].split(",")[0], + percentage=config["RESULT_NUMBER"], + ) return out + def resources(RULE_NAME): out_list = [] for key in config[RULE_NAME]: - out_list.append(f'{key} = {config[RULE_NAME][key]}') - out_str = ',\n'.join(out_list) + out_list.append(f"{key} = {config[RULE_NAME][key]}") + out_str = ",\n".join(out_list) return out_str + rule all: input: - generateOutput + generateOutput, + rule generateIRODS: input: - IRODSinput + IRODSinput, output: - path.join(OUTPUT_DIR, "results", "irods.json") + path.join(OUTPUT_DIR, "results", "irods.json"), log: - 
"logs/generateIRODS.log" + "logs/generateIRODS.log", script: "scripts/generateIRODS.py" + include: "rules/analyse.smk" include: "rules/docking.smk" include: "rules/preparation.smk" From 8834537c86a3e039b4ff975a525ffc294f7ce3ea Mon Sep 17 00:00:00 2001 From: meesters Date: Wed, 6 Aug 2025 13:56:21 +0200 Subject: [PATCH 3/3] refactor: refactored all rule files --- workflow/rules/analyse.smk | 353 +++++++++++++++++++++++---------- workflow/rules/docking.smk | 67 ++++--- workflow/rules/preparation.smk | 111 +++++++---- 3 files changed, 362 insertions(+), 169 deletions(-) diff --git a/workflow/rules/analyse.smk b/workflow/rules/analyse.smk index 61aae9b..eb7b05c 100644 --- a/workflow/rules/analyse.smk +++ b/workflow/rules/analyse.smk @@ -4,6 +4,7 @@ import os import requests from snakemake.logging import logger + def url_reachable(url): """ test for reachable URL @@ -11,30 +12,40 @@ def url_reachable(url): r = requests.head(url) return r.status_code == 200 + def getTranches(): - '''return traches from parsing last log file''' - log_files = glob.glob('.snakemake/log/*') + """return traches from parsing last log file""" + log_files = glob.glob(".snakemake/log/*") sorted_files = sorted(log_files, key=os.path.getmtime) tranch_list = [] - pattern = '[A-K][A-K][ABCEGI][ABCDEF][RMLH][NMLOP]' + pattern = "[A-K][A-K][ABCEGI][ABCDEF][RMLH][NMLOP]" with open(sorted_files[-2]) as log: matches = re.findall(pattern, log.read()) return list(set(matches)) + def library(wildcards): - if DATABASE[0]=="ZINC": # ZINC database selected + if DATABASE[0] == "ZINC": # ZINC database selected - if SUBSET=="TRANCHES": # Tranches selected + if SUBSET == "TRANCHES": # Tranches selected out = [] # test for ZINC reachability: if not url_reachable("http://files.docking.org/3D/"): - logger.info("The ZINC database is not accessible right now. Perhaps it is temporarily down?") - user_input = input("Have you already run this workflow in the current folder with the same input data?(y/n) \n") + logger.info( + "The ZINC database is not accessible right now. Perhaps it is temporarily down?" + ) + user_input = input( + "Have you already run this workflow in the current folder with the same input data?(y/n) \n" + ) if user_input == "y": - logger.info("Trying to proceed without accessing the ZINC database (http://files.docking.org)") - tranch_list = getTranches() #get list with 6 letter code specifing ZINC tranches + logger.info( + "Trying to proceed without accessing the ZINC database (http://files.docking.org)" + ) + tranch_list = ( + getTranches() + ) # get list with 6 letter code specifing ZINC tranches - receptorID = config["TARGETS"][0].split(',')[0] + receptorID = config["TARGETS"][0].split(",")[0] database = config["DATABASE"] for i in tranch_list: @@ -47,185 +58,264 @@ def library(wildcards): else: return out else: - logger.error("Aborting the snakemake run for now, as the data from the ZINC database (http://files.docking.org) are temporarily not available. Please try at a later time.") + logger.error( + "Aborting the snakemake run for now, as the data from the ZINC database (http://files.docking.org) are temporarily not available. Please try at a later time." 
+ ) sys.exit(1) - rawOut= expand(path.join(OUTPUT_DIR, "output", "{receptorID}", - "{receptorID}_{database}_{w}{l}_{w}{l}{r}{p}{ph}{c}.pdbqt.gz"), - receptorID = config["TARGETS"][0].split(',')[0], - database = config["DATABASE"], - w = config["ZINC_INPUT"]["WEIGHT"], - l = config["ZINC_INPUT"]["LOGP"], - r = config["ZINC_INPUT"]["REACT"], - p = config["ZINC_INPUT"]["PURCHASE"], - ph = config["ZINC_INPUT"]["PH"], - c = config["ZINC_INPUT"]["CHARGE"], - dataset = (config["ZINC_INPUT"]["WEIGHT"]) + (config["ZINC_INPUT"]["LOGP"])) + rawOut = expand( + path.join( + OUTPUT_DIR, + "output", + "{receptorID}", + "{receptorID}_{database}_{w}{l}_{w}{l}{r}{p}{ph}{c}.pdbqt.gz", + ), + receptorID=config["TARGETS"][0].split(",")[0], + database=config["DATABASE"], + w=config["ZINC_INPUT"]["WEIGHT"], + l=config["ZINC_INPUT"]["LOGP"], + r=config["ZINC_INPUT"]["REACT"], + p=config["ZINC_INPUT"]["PURCHASE"], + ph=config["ZINC_INPUT"]["PH"], + c=config["ZINC_INPUT"]["CHARGE"], + dataset=(config["ZINC_INPUT"]["WEIGHT"]) + + (config["ZINC_INPUT"]["LOGP"]), + ) for i in rawOut: weighLog = i.split("_")[-2] restAttr = (i.split("_")[-1])[2:6] - url = "".join(['http://files.docking.org/3D/', weighLog,'/', restAttr, '/', weighLog,restAttr,'.xaa.pdbqt.gz']) + url = "".join( + [ + "http://files.docking.org/3D/", + weighLog, + "/", + restAttr, + "/", + weighLog, + restAttr, + ".xaa.pdbqt.gz", + ] + ) r = requests.get(url) if r.status_code == 200: out.append(i) if not out: - logger.error("All selected tranches are empty; select other parameters!") + logger.error( + "All selected tranches are empty; select other parameters!" + ) sys.exit(1) else: return out - else: # if not tranches select subset + else: # if not tranches select subset - out = expand(path.join(OUTPUT_DIR, "output", "{receptorID}", - "{receptorID}_{database}_subsets_{subset}.pdbqt.gz"), - database = config["DATABASE"], - subset = config["SUBSET"], - receptorID = config["TARGETS"][0].split(',')[0]) + out = expand( + path.join( + OUTPUT_DIR, + "output", + "{receptorID}", + "{receptorID}_{database}_subsets_{subset}.pdbqt.gz", + ), + database=config["DATABASE"], + subset=config["SUBSET"], + receptorID=config["TARGETS"][0].split(",")[0], + ) - url = "https://zinc15.docking.org/substances/subsets/" + SUBSET + ".mol2?count=all" + url = ( + "https://zinc15.docking.org/substances/subsets/" + + SUBSET + + ".mol2?count=all" + ) r = requests.get(url) - if r.status_code == 200: #test if subset is valid + if r.status_code == 200: # test if subset is valid return out r_zinc = requests.get("https://zinc15.docking.org/") - if r_zinc.status_code != 200: #test if ZINC database is available - logger.info("The ZINC database is not accessible right now. Perhaps it is temporarily down?") - #if ZINC not available, but dataset is already downloaded --> continue - subset_dir = path.join(INPUT_DIR,config["SUBSET"]+'.mol2') + if r_zinc.status_code != 200: # test if ZINC database is available + logger.info( + "The ZINC database is not accessible right now. Perhaps it is temporarily down?" + ) + # if ZINC not available, but dataset is already downloaded --> continue + subset_dir = path.join(INPUT_DIR, config["SUBSET"] + ".mol2") if os.path.isfile(subset_dir): return out else: - logger.error("Subset is not availiable in the specified data folder. \n Abort snakemake run, try again later") + logger.error( + "Subset is not availiable in the specified data folder. 
\n Abort snakemake run, try again later" + ) sys.exit(1) else: - logger.error( "Invalid subset name!") + logger.error("Invalid subset name!") sys.exit(1) - else: # not ZINC database --> local input data - best = expand(path.join(OUTPUT_DIR, "output", "{receptorID}", - "{receptorID}_{database}_{dataset}_local.pdbqt.gz"), - database = config["DATABASE"], - dataset = config["LOC_DATA"], - receptorID = config["TARGETS"][0].split(',')[0]) + else: # not ZINC database --> local input data + best = expand( + path.join( + OUTPUT_DIR, + "output", + "{receptorID}", + "{receptorID}_{database}_{dataset}_local.pdbqt.gz", + ), + database=config["DATABASE"], + dataset=config["LOC_DATA"], + receptorID=config["TARGETS"][0].split(",")[0], + ) return best + rule makeHistogram: input: - path.join(OUTPUT_DIR, "results", "{receptorID}.pdbqt.gz") + path.join(OUTPUT_DIR, "results", "{receptorID}.pdbqt.gz"), output: - report(path.join(OUTPUT_DIR, "results", "{receptorID}_hist.png"), category="Histogram") + report( + path.join(OUTPUT_DIR, "results", "{receptorID}_hist.png"), + category="Histogram", + ), log: - "logs/makeHistogram_{receptorID}.log" + "logs/makeHistogram_{receptorID}.log", envmodules: - config["PYPLOT"] + config["PYPLOT"], script: "../scripts/makeHistogram.py" + rule bestLigands: input: - library + library, output: - path.join(OUTPUT_DIR, "results", "{receptorID}.pdbqt.gz") + path.join(OUTPUT_DIR, "results", "{receptorID}.pdbqt.gz"), log: - "logs/bestLigands_{receptorID}.log" + "logs/bestLigands_{receptorID}.log", script: "../scripts/mergeOutput.py" + rule dockingResults: input: - path.join(OUTPUT_DIR, "results", "{receptorID}.pdbqt.gz") + path.join(OUTPUT_DIR, "results", "{receptorID}.pdbqt.gz"), output: - path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.pdbqt") + path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.pdbqt"), envmodules: - config["PYTHON"] + config["PYTHON"], log: - "logs/dockingResults_{receptorID}_{percentage}.log" + "logs/dockingResults_{receptorID}_{percentage}.log", threads: config["DOCKING_RESULTS"]["threads"] resources: mem_mb=config["DOCKING_RESULTS"]["mem_mb"], runtime=config["DOCKING_RESULTS"]["runtime"], - partition=config["DOCKING_RESULTS"]["partition"] + partition=config["DOCKING_RESULTS"]["partition"], script: "../scripts/sortResult.py" + rule dockingResultsTxt: input: - path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.pdbqt") + path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.pdbqt"), output: - path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.csv") + path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.csv"), log: - "logs/dockingResultsTxt_{receptorID}_{percentage}.log" + "logs/dockingResultsTxt_{receptorID}_{percentage}.log", wildcard_constraints: receptorID="[^/]+", - percentage="[^/]+" + percentage="[^/]+", script: "../scripts/ResultTxt.py" + rule removeDuplicateLigands: input: - path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.pdbqt") + path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.pdbqt"), output: - path.join(OUTPUT_DIR, "rescreening", "unique", "{receptorID}_{percentage}.pdbqt") + path.join( + OUTPUT_DIR, "rescreening", "unique", "{receptorID}_{percentage}.pdbqt" + ), log: - "logs/removeDuplicateLigands_{receptorID}_{percentage}.log" + "logs/removeDuplicateLigands_{receptorID}_{percentage}.log", shell: "sed '/MODEL [2-9]/,/ENDMDL/d' {input} > {output}" + checkpoint split2: input: - path.join(OUTPUT_DIR, "rescreening", "unique", "{receptorID}_{percentage}.pdbqt") + path.join( + 
OUTPUT_DIR, "rescreening", "unique", "{receptorID}_{percentage}.pdbqt" + ), output: - directory(os.path.join(TMP_DIR, "rescreening_ligands_{percentage}", "{receptorID}")) + directory( + os.path.join(TMP_DIR, "rescreening_ligands_{percentage}", "{receptorID}") + ), log: - "logs/split2_{receptorID}_{percentage}.log" + "logs/split2_{receptorID}_{percentage}.log", script: "../scripts/splitFile.py" + rule prepareLigands2: input: - ligands = path.join(TMP_DIR, "rescreening_ligands_{percentage}", "{receptorID}","{i}.pdbqt") + ligands=path.join( + TMP_DIR, "rescreening_ligands_{percentage}", "{receptorID}", "{i}.pdbqt" + ), output: - ligands = path.join(OUTPUT_DIR, "rescreening_{percentage}", "{name}_{receptorID}", "{i}.txt") + ligands=path.join( + OUTPUT_DIR, "rescreening_{percentage}", "{name}_{receptorID}", "{i}.txt" + ), log: - "logs/prepareLigands2_{receptorID}_{percentage}_{name}_{i}.log" + "logs/prepareLigands2_{receptorID}_{percentage}_{name}_{i}.log", shell: "echo {input.ligands} > {output.ligands}" + rule prepareSecondDocking: input: - grid = path.join(OUTPUT_DIR,"grid","{name}_grid.txt"), - receptor = path.join(PREPARED_DIR,"receptor", "{name}.pdbqt") + grid=path.join(OUTPUT_DIR, "grid", "{name}_grid.txt"), + receptor=path.join(PREPARED_DIR, "receptor", "{name}.pdbqt"), output: - grid = path.join(OUTPUT_DIR, "rescreening_{percentage}", "{name}_{receptorID}", "{name}.grd"), - receptor = path.join(OUTPUT_DIR, "rescreening_{percentage}", "{name}_{receptorID}", "{name}.rec") + grid=path.join( + OUTPUT_DIR, "rescreening_{percentage}", "{name}_{receptorID}", "{name}.grd" + ), + receptor=path.join( + OUTPUT_DIR, "rescreening_{percentage}", "{name}_{receptorID}", "{name}.rec" + ), log: - "logs/prepareSecondDocking_{name}_{receptorID}_{percentage}.log" + "logs/prepareSecondDocking_{name}_{receptorID}_{percentage}.log", shell: """ cp {input.grid} {output.grid} echo {input.receptor} > {output.receptor} """ + rule docking2: input: - ligands = path.join(OUTPUT_DIR, "rescreening_{percentage}", "{name}_{receptorID}", "{i}.txt"), - grid = path.join(OUTPUT_DIR, "rescreening_{percentage}", "{name}_{receptorID}", "{name}.grd"), - receptor = path.join(OUTPUT_DIR, "rescreening_{percentage}", "{name}_{receptorID}", "{name}.rec") + ligands=path.join( + OUTPUT_DIR, "rescreening_{percentage}", "{name}_{receptorID}", "{i}.txt" + ), + grid=path.join( + OUTPUT_DIR, "rescreening_{percentage}", "{name}_{receptorID}", "{name}.grd" + ), + receptor=path.join( + OUTPUT_DIR, "rescreening_{percentage}", "{name}_{receptorID}", "{name}.rec" + ), output: - path.join(OUTPUT_DIR, "rescreening_{percentage}", "{name}_{receptorID}","{name}.rec_{i}.txt.pdbqt.gz") + path.join( + OUTPUT_DIR, + "rescreening_{percentage}", + "{name}_{receptorID}", + "{name}.rec_{i}.txt.pdbqt.gz", + ), log: - "logs/docking2_{name}_{receptorID}_{percentage}_{i}.log" + "logs/docking2_{name}_{receptorID}_{percentage}_{i}.log", params: - dir = path.join(OUTPUT_DIR, "rescreening_{percentage}"), - gridfile = path.join(config["GRID_DIR"], "{name}.gpf"), - cutOff = config["CUTOFF_VALUE"] + dir=path.join(OUTPUT_DIR, "rescreening_{percentage}"), + gridfile=path.join(config["GRID_DIR"], "{name}.gpf"), + cutOff=config["CUTOFF_VALUE"], resources: - mpi = True, + mpi=True, partition=config["DOCKING"]["partition"], tasks=config["DOCKING"]["ntasks"], slurm_extra=config["DOCKING"]["slurm_extra"], - runtime=config["DOCKING"]["runtime"] + runtime=config["DOCKING"]["runtime"], envmodules: - config["VINALC"] + config["VINALC"], shell: """( cd 
{params.dir}/{wildcards.name}_{wildcards.receptorID} @@ -235,47 +325,102 @@ rule docking2: cd - )""" + def aggregate_in2(wildcards): checkpoint_output = checkpoints.split2.get(**wildcards).output[0] - files_names = expand(path.join(OUTPUT_DIR, "rescreening_{{percentage}}", "{{name}}_{{receptorID}}","{{name}}.rec_{i}.txt.pdbqt.gz"), - i = glob_wildcards(os.path.join(checkpoint_output, '{i}.pdbqt')).i) + files_names = expand( + path.join( + OUTPUT_DIR, + "rescreening_{{percentage}}", + "{{name}}_{{receptorID}}", + "{{name}}.rec_{i}.txt.pdbqt.gz", + ), + i=glob_wildcards(os.path.join(checkpoint_output, "{i}.pdbqt")).i, + ) return files_names + rule mergeDocking2: input: - unpack(aggregate_in2) + unpack(aggregate_in2), output: - path.join(OUTPUT_DIR, "output", "rescreening_{percentage}", "{name}_{receptorID}","{name}.pdbqt.gz") + path.join( + OUTPUT_DIR, + "output", + "rescreening_{percentage}", + "{name}_{receptorID}", + "{name}.pdbqt.gz", + ), log: - "logs/mergeDocking2_{name}_{receptorID}_{percentage}.log" + "logs/mergeDocking2_{name}_{receptorID}_{percentage}.log", shell: - ''' + """ cat {input} >> {output} - ''' + """ + rule dockingResults2: input: - path.join(OUTPUT_DIR, "output", "rescreening_{percentage}","{name}_{receptorID}","{name}.pdbqt.gz") + path.join( + OUTPUT_DIR, + "output", + "rescreening_{percentage}", + "{name}_{receptorID}", + "{name}.pdbqt.gz", + ), output: - path.join(OUTPUT_DIR,"output","rescreening_{percentage}","{name}_{receptorID}","{name}_best.pdbqt") + path.join( + OUTPUT_DIR, + "output", + "rescreening_{percentage}", + "{name}_{receptorID}", + "{name}_best.pdbqt", + ), envmodules: - config["PYTHON"] + config["PYTHON"], log: - "logs/dockingResults2_{name}_{receptorID}_{percentage}.log" + "logs/dockingResults2_{name}_{receptorID}_{percentage}.log", script: "../scripts/sortResult.py" + rule makeVenn: input: - best = path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.pdbqt"), - re_results = expand(path.join(OUTPUT_DIR,"output","rescreening_{percentage}","{name}_{receptorID}","{name}_best.pdbqt"), - percentage = config["RESULT_NUMBER"], - receptorID = config["TARGETS"][0].split(',')[0], - name = targetList) + best=path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.pdbqt"), + re_results=expand( + path.join( + OUTPUT_DIR, + "output", + "rescreening_{percentage}", + "{name}_{receptorID}", + "{name}_best.pdbqt", + ), + percentage=config["RESULT_NUMBER"], + receptorID=config["TARGETS"][0].split(",")[0], + name=targetList, + ), output: - report(path.join(OUTPUT_DIR,"results","rescreening_{percentage}","{receptorID}","union.csv"), category="Rescreening"), - report(path.join(OUTPUT_DIR,"results","rescreening_{percentage}","{receptorID}","venn.png"), category="Rescreening") + report( + path.join( + OUTPUT_DIR, + "results", + "rescreening_{percentage}", + "{receptorID}", + "union.csv", + ), + category="Rescreening", + ), + report( + path.join( + OUTPUT_DIR, + "results", + "rescreening_{percentage}", + "{receptorID}", + "venn.png", + ), + category="Rescreening", + ), log: - "logs/makeVenn_{receptorID}_{percentage}.log" + "logs/makeVenn_{receptorID}_{percentage}.log", script: "../scripts/union_venn.py" diff --git a/workflow/rules/docking.smk b/workflow/rules/docking.smk index 505bc38..c1ed1ac 100644 --- a/workflow/rules/docking.smk +++ b/workflow/rules/docking.smk @@ -1,8 +1,8 @@ def get_spacing(gridfile): - #""" - #function to retrieve the 'spacing' (scale) of a grid file - #""" - #with open(gridfile) as infile: + # """ + # function to retrieve the 'spacing' 
(scale) of a grid file + # """ + # with open(gridfile) as infile: # for line in gridfile: # match = re.findall("spacing", line) # if match: @@ -12,28 +12,34 @@ def get_spacing(gridfile): # raise WorkflowError("unable to convert spacing to float from grid file: {gridfile}") # return spacing pass - + rule docking: input: - receptor = path.join(OUTPUT_DIR,"receptor","{receptorID}.txt"), - geometry = path.join(OUTPUT_DIR,"grid","{receptorID}_grid.txt"), - ligands = path.join(OUTPUT_DIR,"library","{database}_{dataset}_{name}_{i}.txt") + receptor=path.join(OUTPUT_DIR, "receptor", "{receptorID}.txt"), + geometry=path.join(OUTPUT_DIR, "grid", "{receptorID}_grid.txt"), + ligands=path.join(OUTPUT_DIR, "library", "{database}_{dataset}_{name}_{i}.txt"), output: - path.join(OUTPUT_DIR, "output","{receptorID}","{dataset}","{receptorID}.txt_{database}_{dataset}_{name}_{i}.txt.pdbqt.gz") + path.join( + OUTPUT_DIR, + "output", + "{receptorID}", + "{dataset}", + "{receptorID}.txt_{database}_{dataset}_{name}_{i}.txt.pdbqt.gz", + ), envmodules: - config["VINALC"] + config["VINALC"], params: - dir = path.join(OUTPUT_DIR, "output"), - space = get_spacing(path.join(config["GRID_DIR"], "{receptorID}.gpf")), - errorDir = path.join(OUTPUT_DIR, "errorLigands.txt"), + dir=path.join(OUTPUT_DIR, "output"), + space=get_spacing(path.join(config["GRID_DIR"], "{receptorID}.gpf")), + errorDir=path.join(OUTPUT_DIR, "errorLigands.txt"), resources: - partition = config["DOCKING"]["partition"], - runtime = config["DOCKING"]["runtime"], - slurm_extra = config["DOCKING"]["slurm_extra"], - mpi = "srun", - mem_mb_per_cpu = config["DOCKING"]["mem_mb_per_cpu"], - ntasks = config["DOCKING"]["ntasks"] + partition=config["DOCKING"]["partition"], + runtime=config["DOCKING"]["runtime"], + slurm_extra=config["DOCKING"]["slurm_extra"], + mpi="srun", + mem_mb_per_cpu=config["DOCKING"]["mem_mb_per_cpu"], + ntasks=config["DOCKING"]["ntasks"], shell: """( mkdir -p {params.dir}/{wildcards.receptorID}/{wildcards.dataset} @@ -45,16 +51,31 @@ rule docking: cd - )""" + def aggregate_in(wildcards): checkpoint_output = checkpoints.split.get(**wildcards).output[0] - files_names = expand(path.join(OUTPUT_DIR,"output","{{receptorID}}", "{{dataset}}", "{{receptorID}}.txt_{{database}}_{{dataset}}_{{name}}_{i}.txt.pdbqt.gz"), - i = glob_wildcards(os.path.join(checkpoint_output, "{i}.pdbqt")).i) + files_names = expand( + path.join( + OUTPUT_DIR, + "output", + "{{receptorID}}", + "{{dataset}}", + "{{receptorID}}.txt_{{database}}_{{dataset}}_{{name}}_{i}.txt.pdbqt.gz", + ), + i=glob_wildcards(os.path.join(checkpoint_output, "{i}.pdbqt")).i, + ) return files_names + rule mergeDocking: input: - unpack(aggregate_in) + unpack(aggregate_in), output: - path.join(OUTPUT_DIR, "output", "{receptorID}", "{receptorID}_{database}_{dataset}_{name}.pdbqt.gz") + path.join( + OUTPUT_DIR, + "output", + "{receptorID}", + "{receptorID}_{database}_{dataset}_{name}.pdbqt.gz", + ), script: "../scripts/mergeOutput.py" diff --git a/workflow/rules/preparation.smk b/workflow/rules/preparation.smk index 1e3a3da..303b2b8 100644 --- a/workflow/rules/preparation.smk +++ b/workflow/rules/preparation.smk @@ -1,144 +1,171 @@ ruleorder: convertMol2 > gunzip + rule targetProtein: output: - path.join(INPUT_DIR, "PDB","receptor","{receptorID}.pdb.gz") + path.join(INPUT_DIR, "PDB", "receptor", "{receptorID}.pdb.gz"), shell: "curl -o {output} {config[TARGET_URL]}/{wildcards.receptorID}.pdb.gz" + rule getZINCdata: output: - path.join(INPUT_DIR, "ZINC","{dataset}","{name}.pdbqt.gz") + 
path.join(INPUT_DIR, "ZINC", "{dataset}", "{name}.pdbqt.gz"), script: "../scripts/ZINCdownload.py" + rule getZINCSubsets: - params: sub = config["SUBSET"] + params: + sub=config["SUBSET"], output: - expand(path.join(INPUT_DIR, "ZINC", "subsets", "{subset}.mol2"), subset=config["SUBSET"]) + expand( + path.join(INPUT_DIR, "ZINC", "subsets", "{subset}.mol2"), + subset=config["SUBSET"], + ), script: "../scripts/downloadZINCsubsets.py" + rule convertMol2: input: - path.join(INPUT_DIR, "ZINC", "subsets", "{subset}.mol2") + path.join(INPUT_DIR, "ZINC", "subsets", "{subset}.mol2"), output: - path.join(TMP_DIR,"unzipped", "ZINC","subsets","{subset}.pdbqt") + path.join(TMP_DIR, "unzipped", "ZINC", "subsets", "{subset}.pdbqt"), envmodules: - config["OPENBABEL"] + config["OPENBABEL"], shell: "obabel {input} -opdbqt -O {output}" + rule mergeLocalInput: - params: in_dir = LOCAL_INPUT + params: + in_dir=LOCAL_INPUT, output: - path.join(TMP_DIR,"unzipped", "{database}","{dataset}","local.pdbqt") + path.join(TMP_DIR, "unzipped", "{database}", "{dataset}", "local.pdbqt"), envmodules: - config["OPENBABEL"] + config["OPENBABEL"], script: "../scripts/concatPDBQT.py" + rule ENAMINEdownload: output: - expand(path.join(INPUT_DIR, "ENAMINE", "{ENAMINE_collection}"), ENAMINE_collection=config["ENAMINE_INPUT"]) + expand( + path.join(INPUT_DIR, "ENAMINE", "{ENAMINE_collection}"), + ENAMINE_collection=config["ENAMINE_INPUT"], + ), script: "../scripts/ENAMINEdownload.py" + rule SDFToPDBQT: input: - path.join(TMP_DIR,"unzipped", "{database}","{dataset}","{name}.sdf") + path.join(TMP_DIR, "unzipped", "{database}", "{dataset}", "{name}.sdf"), output: - path.join(TMP_DIR,"unzipped" ,"{database}","{dataset}","{name}.pdbqt") + path.join(TMP_DIR, "unzipped", "{database}", "{dataset}", "{name}.pdbqt"), envmodules: - config["OPENBABEL"] + config["OPENBABEL"], shell: "obabel {input} -opdbqt -O {output}" + rule prepareReceptor: input: - path.join(TMP_DIR, "unzipped", "PDB", "receptor", "{name}.pdb") + path.join(TMP_DIR, "unzipped", "PDB", "receptor", "{name}.pdb"), output: - path.join(TMP_DIR ,"PDB", "receptor", "{name}.pdb") + path.join(TMP_DIR, "PDB", "receptor", "{name}.pdb"), envmodules: - config["BIOPYTHON"] + config["BIOPYTHON"], script: "../scripts/prepareReceptor.py" + rule makeReceptorPDBQT: input: - path.join(TMP_DIR, "PDB", "receptor", "{name}.pdb") + path.join(TMP_DIR, "PDB", "receptor", "{name}.pdb"), output: - path.join(PREPARED_DIR,"receptor", "{name}.pdbqt") + path.join(PREPARED_DIR, "receptor", "{name}.pdbqt"), envmodules: - config["OPENBABEL"] + config["OPENBABEL"], shell: "obabel -ipdb {input} -opdbqt -O {output} -xr" rule gunzip: input: - path.join(INPUT_DIR, "{database}","{dataset}","{name}.{filetype}.gz") + path.join(INPUT_DIR, "{database}", "{dataset}", "{name}.{filetype}.gz"), output: - path.join(TMP_DIR,"unzipped", "{database}","{dataset}","{name}.{filetype}") + path.join(TMP_DIR, "unzipped", "{database}", "{dataset}", "{name}.{filetype}"), shell: "gunzip < {input} > {output} || touch {output}" + rule cleanLigands: input: - path.join(TMP_DIR,"unzipped", "{database}","{dataset}","{name}.pdbqt") + path.join(TMP_DIR, "unzipped", "{database}", "{dataset}", "{name}.pdbqt"), output: - path.join(TMP_DIR, "cleaned", "{database}","{dataset}","{name}.pdbqt") + path.join(TMP_DIR, "cleaned", "{database}", "{dataset}", "{name}.pdbqt"), shell: "awk '/MODEL/ {{s=1}} s {{a[++c]=$0}} /Si/ {{p=1}} /ENDMDL/ {{if (!p) for (i=1;i<=c;i++) print a[i]; delete a;s=p=c=0;next}} !s' {input} > {output}" + checkpoint split: input: - 
path.join(TMP_DIR, "cleaned", "{database}","{dataset}","{name}.pdbqt") + path.join(TMP_DIR, "cleaned", "{database}", "{dataset}", "{name}.pdbqt"), output: - directory(os.path.join(TMP_DIR, "prepared", "{database}","{dataset}","{name}_dir")) + directory( + os.path.join(TMP_DIR, "prepared", "{database}", "{dataset}", "{name}_dir") + ), script: "../scripts/splitFile.py" + rule energyMin: input: - path.join(TMP_DIR, "prepared", "{database}","{dataset}","{name}_dir","{i}.pdbqt") + path.join( + TMP_DIR, "prepared", "{database}", "{dataset}", "{name}_dir", "{i}.pdbqt" + ), output: - path.join(MIN_DIR,"{database}","{dataset}","{name}","{i}.pdbqt") + path.join(MIN_DIR, "{database}", "{dataset}", "{name}", "{i}.pdbqt"), params: - algorithm = config["ENERGY_MIN_ALGORITHM"], - steps = config["STEPS"], - forcefield = config["FORCEFIELD"], - convergence = config["CONVERGENCE_CRITERIA"] + algorithm=config["ENERGY_MIN_ALGORITHM"], + steps=config["STEPS"], + forcefield=config["FORCEFIELD"], + convergence=config["CONVERGENCE_CRITERIA"], threads: config["ENERGY_MIN"]["threads"] resources: - partition = config["ENERGY_MIN"]["partition"], - runtime = config["ENERGY_MIN"]["runtime"], - mem_mb = config["ENERGY_MIN"]["mem_mb"], + partition=config["ENERGY_MIN"]["partition"], + runtime=config["ENERGY_MIN"]["runtime"], + mem_mb=config["ENERGY_MIN"]["mem_mb"], envmodules: - config["OPENBABEL"] + config["OPENBABEL"], shell: "obabel -ipdbqt {input} -opdbqt -O {output} --minimize --{params.algorithm} --steps {params.steps} --ff {params.forcefield} --crit {params.convergence}" + rule prepareGeometry: input: - path.join(config["GRID_DIR"],"{receptorID}.gpf") + path.join(config["GRID_DIR"], "{receptorID}.gpf"), output: - path.join(OUTPUT_DIR,"grid","{receptorID}_grid.txt") + path.join(OUTPUT_DIR, "grid", "{receptorID}_grid.txt"), shell: "egrep 'npts|gridcenter' {input} |cut -f2-4 -d' '| tac |tr '\n' ' ' > {output} && sed -i -e '$a\ ' {output}" + rule prepareLibrary: input: - path.join(MIN_DIR,"{database}","{dataset}","{name}","{i}.pdbqt") + path.join(MIN_DIR, "{database}", "{dataset}", "{name}", "{i}.pdbqt"), output: - library=path.join(OUTPUT_DIR,"library","{database}_{dataset}_{name}_{i}.txt") + library=path.join(OUTPUT_DIR, "library", "{database}_{dataset}_{name}_{i}.txt"), shell: "echo {input} > {output}" + rule prepareDocking: input: - path.join(PREPARED_DIR, "receptor", "{receptorID}.pdbqt") + path.join(PREPARED_DIR, "receptor", "{receptorID}.pdbqt"), output: - path.join(OUTPUT_DIR,"receptor","{receptorID}.txt") + path.join(OUTPUT_DIR, "receptor", "{receptorID}.txt"), shell: "echo {input} > {output}"