@@ -15,46 +15,46 @@ def bisect_threshold(lib_path, data_path, target_count=500):
1515 The input directory should contain the graph database as .lg file, the number of graphs in the file as .count file.
1616 It is assumed, that the count files are enumerated the same way as the graph databases, so that a lexicographic sorting gives matching files.'''
1717 target_count = int (target_count )
18-
18+
1919 # First check, that we have as many count-files as graph files.
2020 db_files = sorted ([file_name for file_name in os .listdir (data_path ) if file_name .endswith ('.aids' )])
2121 count_files = sorted ([file_name for file_name in os .listdir (data_path ) if file_name .endswith ('.count' )])
2222 assert len (db_files ) == len (count_files )
23-
24-
23+
24+
2525 for idx , in_file in enumerate (db_files ):
2626 match_filename = re .match (regex_count_file , in_file )
27-
27+
2828 if not match_filename :
2929 assert False , "Filename for db_size not formatted as expected."
30-
31-
30+
31+
3232 database_id = int (match_filename .group (1 ))
33-
34-
33+
34+
3535 t_max = get_db_size (data_path + count_files [idx ])
3636 t_min = 2
37-
37+
3838 # Bisection to find a heuristically good threshold
3939 while t_max - t_min > 1 :
4040 t = (t_max + t_min ) // 2
4141 found_patterns = run_approximate (lib_path , data_path + in_file , data_path + 'frequent_temp.cstring' , t )
42-
42+
4343 if found_patterns >= target_count :
44- t_min = t
44+ t_min = t
4545 else :
4646 t_max = t
47-
47+
4848
4949 # Write found threshold to file
5050 with open (data_path + str (database_id ) + '.threshold' , 'w' ) as f :
51- f .write (str (t ))
52-
53-
51+ f .write (str (t ))
52+
53+
def get_db_size(file_name):
    """Return the integer stored in a .count file.

    The file is expected to contain a single integer (surrounding whitespace
    is tolerated by ``int``); the caller uses it as the number of graphs in
    the corresponding database.

    Args:
        file_name: Path of the count file to read.

    Returns:
        The file content parsed as ``int``.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the content is not a single integer. The message names
            the offending file so a bad count file can be found quickly.
    """
    with open(file_name) as f:
        content = f.read()

    try:
        return int(content)
    except ValueError as err:
        # Re-raise as the same exception type (callers are unaffected) but
        # with enough context to locate the broken input.
        raise ValueError(
            "Count file {!r} does not contain a single integer: {!r}".format(file_name, content)
        ) from err
57-
57+
5858def count_subgraphs (subgraph_file ):
5959 ''' Assume a AIDS format. In this format, every graph is represented by 3 lines.'''
6060 i = None
@@ -65,34 +65,34 @@ def count_subgraphs(subgraph_file):
6565 return ceil ((i + 1 ) / 3 )
6666 else :
6767 return 0
68-
68+
def run_approximate(lib_path, input_file, output_file, threshold):
    """Run the approximate miner once and return the number of patterns found.

    Two shell pipelines are executed:

    1. ``{lib_path}lwg`` is run on ``input_file`` with the given frequency
       ``threshold`` and the fixed options ``-p 8 -e hops -i 5``, writing to
       ``output_file``. It is given at most 30 seconds.
    2. Every line of ``output_file`` is piped through ``{lib_path}cstring``
       and the result is collected in ``output_file + ".tmp"``.

    The ``.tmp`` file is then handed to ``count_subgraphs``.

    NOTE(review): the original note said the maximum pattern size is fixed;
    presumably that is what ``-p 8`` does -- confirm against the lwg docs.

    Args:
        lib_path: Directory prefix (including trailing separator) where the
            ``lwg`` and ``cstring`` binaries live.
        input_file: Path of the graph database to mine.
        output_file: Path the miner writes its patterns to.
        threshold: Frequency threshold passed to ``lwg -t``.

    Returns:
        Whatever ``count_subgraphs`` reports for the transformed output.
    """
    # Run HOPS approximate subgraph miner
    lwg_cmd_template = "'{lib_path}lwg' -t {threshold} -p 8 -e hops -i 5 '{input_file}' -o '{output_file}'"
    miner_cmd = lwg_cmd_template.format(
        lib_path=lib_path,
        input_file=input_file,
        output_file=output_file,
        threshold=threshold,
    )

    p = subprocess.Popen(miner_cmd, shell=True)

    try:
        p.wait(30)  # Should take at most 30 seconds
    except subprocess.TimeoutExpired as e:
        # Only a timeout is expected here; anything else is a real error and
        # should propagate instead of being printed and ignored.
        print(str(e))
        p.kill()
        # Reap the killed process so it does not linger as a zombie.
        # NOTE(review): with shell=True this kills the shell; the miner itself
        # may keep running -- verify if timeouts are common.
        p.wait()

    # transform output file (so that we can easier count the number of patterns later)
    cstring_cmd = "cat " + output_file + " | xargs -I {} bash -c \" echo '{}' | '" + lib_path + "cstring' -i -\" > " + output_file + ".tmp"

    subprocess.run(cstring_cmd, shell=True)

    # The intermediate files (output_file and output_file + ".tmp") are
    # deliberately left on disk; cleanup was disabled in the original.
    nb_subgraphs = count_subgraphs(output_file + ".tmp")
    return nb_subgraphs
93-
93+
if __name__ == "__main__":
    # Command-line entry point: expects exactly three arguments
    # (lib_path, data_path, target_count). target_count is passed through as
    # a string; bisect_threshold converts it with int().
    if len(sys.argv) == 4:
        bisect_threshold(sys.argv[1], sys.argv[2], target_count=sys.argv[3])
    else:
        print("Unexpected number of arguments. Run as python bisect_threshold_search.py [lib_path] [data_path] [target_count]. lwg and cstring tool binaries need to be located in the lib_path. Data directiory is expected to contain .lg line graph databases and .count files with the number of graphs in the corresponding database.")
        # Signal the usage error to the calling shell instead of exiting 0.
        sys.exit(1)
0 commit comments