From edfc8c62745c5dc4570d8abf03b9b75f83fa1c35 Mon Sep 17 00:00:00 2001 From: Manu-chroma Date: Sun, 2 Apr 2017 17:30:49 +0530 Subject: [PATCH 1/3] working towards py3 compatibility - fixed imports - print statements --- .travis.yml | 4 +- setup.cfg | 2 +- test/test_chat.py | 3 +- wp_parser/ChatFeatures.py | 169 +++++++++++++++--------------- wp_parser/datelib.py | 15 +-- wp_parser/parsers/facebook.py | 9 +- wp_parser/parsers/facebook.py.bak | 29 +++++ wp_parser/parsers/message.py | 15 ++- wp_parser/parsers/whatsapp.py | 8 +- wp_parser/parsers/whatsapp.py.bak | 49 +++++++++ wp_parser/wp_chat.py | 127 +++++++++++----------- 11 files changed, 257 insertions(+), 173 deletions(-) create mode 100644 wp_parser/parsers/facebook.py.bak create mode 100644 wp_parser/parsers/whatsapp.py.bak diff --git a/.travis.yml b/.travis.yml index b0c75b7..5738124 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,8 +1,8 @@ language: python python: - "2.7" -# - "3.4" -# - "3.5" + - "3.4" + - "3.5" # - "3.6" #install: # - pip install . diff --git a/setup.cfg b/setup.cfg index 4364fa9..5490153 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,2 +1,2 @@ [tool:pytest] -norecursedirs= venv +norecursedirs = venv diff --git a/test/test_chat.py b/test/test_chat.py index 8e5ce24..7f5221d 100644 --- a/test/test_chat.py +++ b/test/test_chat.py @@ -1,10 +1,11 @@ import os class TestChat: + def test_chat(self, tmpdir): out_filename = str(tmpdir.join("abc")) for case in ['One', 'Two']: - cmd = 'python wp_parser/wp_chat.py -f test/testChat2.txt -n Username{} > {}'.format(case, out_filename) + cmd = 'python -p wp_parser/wp_chat.py -f test/testChat2.txt -n Username{} > {}'.format(case, out_filename) os.system(cmd) with open(out_filename) as fh: result = fh.read() diff --git a/wp_parser/ChatFeatures.py b/wp_parser/ChatFeatures.py index 1e754b4..ef0acb3 100644 --- a/wp_parser/ChatFeatures.py +++ b/wp_parser/ChatFeatures.py @@ -1,24 +1,28 @@ # -*- coding: utf-8 -*- from __future__ import division -import datelib -import re +from __future__ import absolute_import + import operator -class ChatFeatures(): +import re +import wp_parser.datelib as datelib + +class ChatFeatures: def __init__(self): - self.root_response_time = [] + self.root_response_time = [] self.contact_response_time = [] - self.root_burst = [] - self.contact_burst = [] - self.initiations = {} - self.weekday = {} - self.shifts = {} - self.patterns = {} - self.proportions = {} - self.most_used_words = {} - - def compute_response_time_and_burst(self, list_of_messages, root_name, senders, initiation_thrs=(60*60*8), burst_thrs=3, response_thrs=(60*60*3)): + self.root_burst = [] + self.contact_burst = [] + self.initiations = {} + self.weekday = {} + self.shifts = {} + self.patterns = {} + self.proportions = {} + self.most_used_words = {} + + def compute_response_time_and_burst(self, list_of_messages, root_name, senders, initiation_thrs=(60 * 60 * 8), + burst_thrs=3, response_thrs=(60 * 60 * 3)): # perform the operations that are dependant on multiple messages # (response time, bursts) self.initiations = {} @@ -27,7 +31,7 @@ def compute_response_time_and_burst(self, list_of_messages, root_name, senders, t0 = list_of_messages[0].datetime_obj burst_count = 1 for index, message in enumerate(list_of_messages): - #skip the first message since we are looking at differences; note this means we don't count first msg as init + # skip the first message since we are looking at differences; note this means we don't count first msg as init if index == 0: continue t1 = message.datetime_obj @@ -35,22 +39,22 @@ def compute_response_time_and_burst(self, list_of_messages, root_name, senders, dt.total_seconds() # print "sender %s delta %s" % ( message.sender, dt.total_seconds() ) - if (dt.total_seconds() > initiation_thrs): + if dt.total_seconds() > initiation_thrs: self.initiations[message.sender] += 1 # is sender the same as the last message? - if message.sender != list_of_messages[index-1].sender: + if message.sender != list_of_messages[index - 1].sender: # sender changed, store the burst count and reset - #print "sender changed: %s" % ( message.sender ) - #print "burst count: %s" % ( burst_count ) + # print "sender changed: %s" % ( message.sender ) + # print "burst count: %s" % ( burst_count ) - #print("response time: %d\n" %(dt.total_seconds()) ) + # print("response time: %d\n" %(dt.total_seconds()) ) # is sender the root? if message.sender == root_name: # store the burst count for the last sender, which is the # opposite of current if burst_count > burst_thrs: - #print "BURST CONTACT ENDED: %s IN A ROW" % ( burst_count ) + # print "BURST CONTACT ENDED: %s IN A ROW" % ( burst_count ) self.contact_burst.append(burst_count) if dt.total_seconds() < response_thrs: self.root_response_time.append(dt.total_seconds()) @@ -59,11 +63,11 @@ def compute_response_time_and_burst(self, list_of_messages, root_name, senders, # store the burst count for the last sender, which is the # opposite of current if burst_count > burst_thrs: - #print "BURST ROOT ENDED: %s IN A ROW" % ( burst_count ) + # print "BURST ROOT ENDED: %s IN A ROW" % ( burst_count ) self.root_burst.append(burst_count) if dt.total_seconds() < response_thrs: self.contact_response_time.append(dt.total_seconds()) - + # End of the first burst, restart the counter burst_count = 1 @@ -71,12 +75,12 @@ def compute_response_time_and_burst(self, list_of_messages, root_name, senders, # accumulate the number of messages sent in a row burst_count += 1 t0 = t1 - if burst_count > burst_thrs: #catch a burst if at end of chat - #print "final burst: %s" % ( burst_count ) - if message.sender == root_name: + if burst_count > burst_thrs: # catch a burst if at end of chat + # print "final burst: %s" % ( burst_count ) + if message.sender == root_name: self.root_burst.append(burst_count) else: - self.contact_burst.append(burst_count) + self.contact_burst.append(burst_count) def compute_messages_per_weekday(self, list_of_messages): self.weekday = { @@ -105,16 +109,16 @@ def compute_messages_per_shift(self, list_of_messages): } for msg in list_of_messages: hour = int(msg.time.split(":")[0]) - if hour >= 0 and hour <= 6: + if 0 <= hour <= 6: self.shifts["latenight"] += 1 - elif hour > 6 and hour <= 11: + elif 6 < hour <= 11: self.shifts["morning"] += 1 - elif hour > 11 and hour <= 17: + elif 11 < hour <= 17: self.shifts["afternoon"] += 1 - elif hour > 17 and hour <= 23: + elif 17 < hour <= 23: self.shifts["evening"] += 1 return self.shifts @@ -134,7 +138,7 @@ def compute_messages_pattern(self, list_of_messages, senders, pattern_list): if length > 0: if pattern not in self.patterns: self.patterns[pattern][msg.sender] = length - print "This should never happen" + print("This should never happen") else: self.patterns[pattern][msg.sender] += length return self.patterns @@ -149,10 +153,10 @@ def compute_message_proportions(self, list_of_messages, senders, root, contact): self.proportions[i][s] = 0 for msg in list_of_messages: self.proportions["messages"][msg.sender] += 1 - self.proportions["words"][msg.sender] += len(msg.content.split(" ")) - self.proportions["chars"][msg.sender] += len(msg.content.strip()) - self.proportions["qmarks"][msg.sender] += msg.content.count('?') - self.proportions["exclams"][msg.sender] += msg.content.count('!') + self.proportions["words"][msg.sender] += len(msg.content.split(" ")) + self.proportions["chars"][msg.sender] += len(msg.content.strip()) + self.proportions["qmarks"][msg.sender] += msg.content.count('?') + self.proportions["exclams"][msg.sender] += msg.content.count('!') self.proportions["media"][msg.sender] += ( msg.content.count('') + msg.content.count('') + @@ -170,24 +174,24 @@ def compute_message_proportions(self, list_of_messages, senders, root, contact): self.proportions["avg_words"] = {} for s in senders: self.proportions["avg_words"][s] = self.proportions["words"][s] / self.proportions["messages"][s] - self.proportions["avg_words"]["ratio"] = self.proportions["avg_words"][root] / self.proportions["avg_words"][contact] + self.proportions["avg_words"]["ratio"] = self.proportions["avg_words"][root] / self.proportions["avg_words"][ + contact] for c in categories: self.proportions[c]["total"] = 0 for s in senders: self.proportions[c]["total"] += self.proportions[c][s] - + for c in categories: - - #if a value is 0, replace with a 1 to avoid zero erros in ratio calcs. + + # if a value is 0, replace with a 1 to avoid zero erros in ratio calcs. if self.proportions[c][contact] == 0: self.proportions[c][contact] = 1 if self.proportions[c][root] == 0: - self.proportions[c][root] = 1 + self.proportions[c][root] = 1 self.proportions[c]["ratio"] = self.proportions[c][root] / self.proportions[c][contact] - return self.proportions def compute_most_used_words(self, list_of_messages, top=10, threshold=3): @@ -204,37 +208,37 @@ def compute_most_used_words(self, list_of_messages, top=10, threshold=3): words_counter[w] = 1 else: words_counter[w] += 1 - sorted_words = sorted(words_counter.iteritems(), key=operator.itemgetter(1), reverse=True) + sorted_words = sorted(words_counter.items(), key=operator.itemgetter(1), reverse=True) self.most_used_words = sorted_words[:top] return self.most_used_words def compute_avg_root_response_time(self): - if (len(self.root_response_time) != 0): - return sum(self.root_response_time)/len(self.root_response_time) + if len(self.root_response_time) != 0: + return sum(self.root_response_time) / len(self.root_response_time) return 0 def compute_avg_contact_response_time(self): - if (len(self.contact_response_time) != 0): - return sum(self.contact_response_time)/len(self.contact_response_time) + if len(self.contact_response_time) != 0: + return sum(self.contact_response_time) / len(self.contact_response_time) return 0 def compute_response_time_ratio(self, root, contact): avg_root = self.compute_avg_root_response_time() avg_contact = self.compute_avg_contact_response_time() - if (avg_contact != 0): + if avg_contact != 0: return avg_root / avg_contact return 0 def compute_bursts_ratio(self, root, contact): if (len(self.contact_burst)) == 0: return len(self.root_burst) / 1 - if (len(self.root_burst) == 0): - return ( 1/len(self.contact_burst)) - return len(self.root_burst)/len(self.contact_burst) + if len(self.root_burst) == 0: + return 1 / len(self.contact_burst) + return len(self.root_burst) / len(self.contact_burst) def compute_nbr_root_burst(self): return len(self.root_burst) - + def compute_nbr_contact_burst(self): return len(self.contact_burst) @@ -244,48 +248,41 @@ def compute_nbr_contact_burst(self): # return 0 def compute_avg_contact_burst(self): - if (len(self.contact_burst) != 0): - return sum(self.contact_burst)/len(self.contact_burst) + if len(self.contact_burst) != 0: + return sum(self.contact_burst) / len(self.contact_burst) return 0 def compute_root_initation_ratio(self, root, contact): - if (self.initiations[contact] == 0): - return self.initiations[root]/1 - if (self.initiations[root] == 0): - return 1/self.initiations[contact] + if self.initiations[contact] == 0: + return self.initiations[root] / 1 + if self.initiations[root] == 0: + return 1 / self.initiations[contact] return self.initiations[root] / self.initiations[contact] - + def generate_outcome(self, root, contact, methodology): - outcome = 99; + outcome = 99 if methodology == 0: - if (self.compute_root_initation_ratio(root, contact) > 0.867): - outcome = 0 #"just not that into you" - #print "DOESNT INITIATE" - elif (self.proportions["qmarks"]["ratio"] > 0.87): #flipped the non-intutitive direction of inequality - outcome = 0 #"just not that into you" - #print "QUESTIONS FAIL" + if self.compute_root_initation_ratio(root, contact) > 0.867: + outcome = 0 # "just not that into you" + # print "DOESNT INITIATE" + elif self.proportions["qmarks"]["ratio"] > 0.87: # flipped the non-intuitive direction of inequality + outcome = 0 # "just not that into you" + # print "QUESTIONS FAIL" else: - outcome = 1 #"definitely into you" - #print "ELSE" + outcome = 1 # "definitely into you" + # print "ELSE" elif methodology == 1: - if (self.compute_root_initation_ratio(root, contact) > 0.83): - outcome = 0 #"just not that into you" - #print "DOESNT INITIATE" - elif (self.features.compute_avg_root_response_time() < 0.92): #flipped non-intuitive direction of inequality - outcome = 0 #"just not that into you" - #print "QUESTIONS FAIL" + if self.compute_root_initation_ratio(root, contact) > 0.83: + outcome = 0 # "just not that into you" + # print "DOESNT INITIATE" + elif self.features.compute_avg_root_response_time() < 0.92: # flipped non-intuitive direction of inequality + outcome = 0 # "just not that into you" + # print "QUESTIONS FAIL" else: - outcome = 1 #"definitely into you" - #print "ELSE" + outcome = 1 # "definitely into you" + # print "ELSE" else: - outcome = 99; - - return outcome - -# qMarksPerRoot = qmarksRoot/messagesRoot - # qMarksPerContact = qmarksContact/messagesContact - - - - \ No newline at end of file + outcome = 99 + + return outcome diff --git a/wp_parser/datelib.py b/wp_parser/datelib.py index 33f69d6..a540fd3 100644 --- a/wp_parser/datelib.py +++ b/wp_parser/datelib.py @@ -1,7 +1,7 @@ +import time from datetime import date from datetime import datetime from datetime import timedelta -import time # get current ymd @@ -37,11 +37,13 @@ def valid_date(date_str): return valid + def date_diff(dateobj1, dateobj2): import math delta = dateobj2 - dateobj1 return int(math.fabs(delta.days)) + def datecmp(date1, date2): year, month, day = date_split(date1) year_t, month_t, day_t = date_split(date2) @@ -53,8 +55,8 @@ def datecmp(date1, date2): else: return 1 except ValueError: - #misc.error("Fix me! Invalid date", "datecmp") - print "Fix me! Invalid date" + # misc.error("Fix me! Invalid date", "datecmp") + print("Fix me! Invalid date") return False @@ -65,7 +67,7 @@ def date_operation(date_str, num): return end_date -def date_to_str(date_str): +def date_to_str(): return date.strftime('%Y-%m-%d') @@ -89,7 +91,7 @@ def date_interval(initial_date, length, step=1, separator="-"): output = [] current = start_date while current < end_date: - output.append(date_to_str(current)) + output.append(date_to_str()) current += timedelta(days=step) return output @@ -119,5 +121,6 @@ def weekday_portuguese_to_english(string): elif string == "sab" or string == "sabado": return "Saturday" + if __name__ == "__main__": - print date_diff(datetime(2015, 6, 4), datetime(2015, 07, 7)) \ No newline at end of file + print(date_diff(datetime(2015, 6, 4), datetime(2015, 7, 7))) diff --git a/wp_parser/parsers/facebook.py b/wp_parser/parsers/facebook.py index 2d93c9a..c6fea13 100644 --- a/wp_parser/parsers/facebook.py +++ b/wp_parser/parsers/facebook.py @@ -1,12 +1,13 @@ from datetime import datetime -import message -class ParserFacebook(): +from . import message - ''' A line is a dict object in this format: + +class ParserFacebook: + """ A line is a dict object in this format: {u'message': u'text text', u'from': u'Username One', u'id': u'3294659605566648_1432085429', u'datetime': u'2015-05-20T01:30:29+0000'} - ''' + """ def __init__(self, raw_messages): self.raw_messages = raw_messages diff --git a/wp_parser/parsers/facebook.py.bak b/wp_parser/parsers/facebook.py.bak new file mode 100644 index 0000000..2d93c9a --- /dev/null +++ b/wp_parser/parsers/facebook.py.bak @@ -0,0 +1,29 @@ +from datetime import datetime +import message + +class ParserFacebook(): + + ''' A line is a dict object in this format: + {u'message': u'text text', u'from': u'Username One', u'id': + u'3294659605566648_1432085429', u'datetime': u'2015-05-20T01:30:29+0000'} + ''' + + def __init__(self, raw_messages): + self.raw_messages = raw_messages + + def parse(self): + list_of_messages = [] + set_of_senders = set() + for l in self.raw_messages: + content = l["message"].encode("utf-8") + sender = l["from"].encode("utf-8") + datetime_str = l["datetime"].encode("utf-8") + date, time = datetime_str.split("T") + time = time.replace("+0000", "") + msg_date = date + " " + time + datetime_obj = datetime.strptime(msg_date, "%Y-%m-%d %H:%M:%S") + + set_of_senders.add(sender) + list_of_messages.append(message.Message(sender, content, date, time, datetime_obj)) + + return list(set_of_senders), list_of_messages diff --git a/wp_parser/parsers/message.py b/wp_parser/parsers/message.py index b69afd8..53975a1 100644 --- a/wp_parser/parsers/message.py +++ b/wp_parser/parsers/message.py @@ -1,11 +1,10 @@ -class Message(): - +class Message: def __init__(self, sender, content, date, time, datetime_obj): - self.sender = sender - self.content = content - self.date = date - self.time = time - self.datetime_obj = datetime_obj + self.sender = sender + self.content = content + self.date = date + self.time = time + self.datetime_obj = datetime_obj def __repr__(self): - return " ".join(str(v) for v in [self.datetime_obj, self.sender, self.content]) \ No newline at end of file + return " ".join(str(v) for v in [self.datetime_obj, self.sender, self.content]) diff --git a/wp_parser/parsers/whatsapp.py b/wp_parser/parsers/whatsapp.py index 8eddc43..7d25998 100644 --- a/wp_parser/parsers/whatsapp.py +++ b/wp_parser/parsers/whatsapp.py @@ -1,13 +1,15 @@ from datetime import datetime -import message -''' A line can be either: +from . import message + +''' A line can be either: 09/12/2012 17:03:48: Sender Name: Message 3/24/14, 1:59:59 PM: Sender Name: Message 24/3/14, 13:59:59: Sender Name: Message ''' -class ParserWhatsapp(): + +class ParserWhatsapp: def __init__(self, raw_messages): self.raw_messages = raw_messages diff --git a/wp_parser/parsers/whatsapp.py.bak b/wp_parser/parsers/whatsapp.py.bak new file mode 100644 index 0000000..8eddc43 --- /dev/null +++ b/wp_parser/parsers/whatsapp.py.bak @@ -0,0 +1,49 @@ +from datetime import datetime +import message + +''' A line can be either: + 09/12/2012 17:03:48: Sender Name: Message + 3/24/14, 1:59:59 PM: Sender Name: Message + 24/3/14, 13:59:59: Sender Name: Message +''' + +class ParserWhatsapp(): + + def __init__(self, raw_messages): + self.raw_messages = raw_messages + + def parse(self): + list_of_messages = [] + set_of_senders = set() + for l in self.raw_messages: + msg_date, sep, msg = l.partition(": ") + raw_date, sep, time = msg_date.partition(" ") + sender, sep, content = msg.partition(": ") + raw_date = raw_date.replace(",", "") + year = raw_date.split(" ")[0].split("/")[-1] + # The following lines treats: + # 3/24/14 1:59:59 PM + # 24/3/14 13:59:59 PM + # Couldn't we use msg_date instead of chatTimeString here? + + # colonIndex = [x.start() for x in re.finditer(':', l)] + # print l, colonIndex + # chatTimeString = l[0:colonIndex[2]] + # This ignores a minority of bad formatted lines using try/except block. + # an execption is raised when the datetime_obj is not created due to date parsing error + try: + if "AM" in msg_date or "PM" in msg_date: + datetime_obj = datetime.strptime( + msg_date, "%m/%d/%y, %I:%M:%S %p") + else: + if len(year) == 2: + datetime_obj = datetime.strptime(msg_date, "%m/%d/%y %H:%M:%S") + else: + datetime_obj = datetime.strptime(msg_date, "%m/%d/%Y %H:%M:%S") + except ValueError: + continue + + set_of_senders.add(sender) + list_of_messages.append(message.Message(sender, content, raw_date, time, datetime_obj)) + + return list(set_of_senders), list_of_messages diff --git a/wp_parser/wp_chat.py b/wp_parser/wp_chat.py index 2b76194..c9adea6 100644 --- a/wp_parser/wp_chat.py +++ b/wp_parser/wp_chat.py @@ -2,45 +2,42 @@ # -*- coding: utf-8 -*- from __future__ import division +from __future__ import absolute_import -from parsers import whatsapp, facebook -from ChatFeatures import ChatFeatures - +import argparse +import json +import operator -from datetime import datetime import codecs -import operator -import sys -import json -import csv -import argparse -import os +import os +from wp_parser.ChatFeatures import ChatFeatures +from wp_parser.parsers import whatsapp, facebook def pretty_print(dic, parent, depth): - tup = sorted(dic.iteritems(), key=operator.itemgetter(1)) + tup = sorted(dic.items(), key=operator.itemgetter(1)) isLeaf = True for key in tup: if isinstance(dic[key[0]], dict): isLeaf = False if isLeaf and depth != 0: - print " " * (depth - 1) * 2, parent + print(" " * (depth - 1) * 2, parent) for key in tup: if isinstance(dic[key[0]], dict): pretty_print(dic[key[0]], key[0], depth + 1) else: - print " " * depth * 2, str(key[0]), "->", dic[key[0]] + print(" " * depth * 2, str(key[0]), "->", dic[key[0]]) -class Chat(): +class Chat: def __init__(self, filename, platform="WhatsApp"): - self.filename = filename - self.platform = platform + self.filename = filename + self.platform = platform self.raw_messages = [] - self.messages = [] # List of Messages objects - self.features = ChatFeatures() # Chat Features object - self.senders = [] - self.root = '' + self.messages = [] # List of Messages objects + self.features = ChatFeatures() # Chat Features object + self.senders = [] + self.root = '' if platform == "WhatsApp": self.open_file = self.open_file_whatsapp @@ -98,72 +95,72 @@ def message_proportions(self): def most_used_words(self): return self.features.compute_most_used_words(self.messages, 10, 3) - + def all_features(self, **kargs): burst_thrs = kargs.get("burst_thrs", 3) - initiation_thrs = kargs.get("initiation_thrs", 60*60*8) - response_thrs = kargs.get("response_thrs", 60*60*3) + initiation_thrs = kargs.get("initiation_thrs", 60 * 60 * 8) + response_thrs = kargs.get("response_thrs", 60 * 60 * 3) pattern_list = kargs.get("pattern_list", []) top = kargs.get("top", 10) word_length_threshold = kargs.get("word_length_threshold", 3) - self.features.compute_response_time_and_burst(self.messages, self.root, self.senders, initiation_thrs, burst_thrs, response_thrs) - self.features.compute_messages_per_weekday(self.messages) + self.features.compute_response_time_and_burst(self.messages, self.root, self.senders, initiation_thrs, + burst_thrs, response_thrs) + self.features.compute_messages_per_weekday(self.messages) self.features.compute_messages_per_shift(self.messages) self.features.compute_messages_pattern(self.messages, self.senders, pattern_list) self.features.compute_message_proportions(self.messages, self.senders, self.root, self.get_contact()) self.features.compute_most_used_words(self.messages, top, word_length_threshold) def print_features(self): - print "Root: %s" % (self.senders[0]) - print "" + print("Root: %s" % (self.senders[0])) + print("") - print "Average root response time (s): %.2f" % (self.features.compute_avg_root_response_time()) - print "Average contact response time (s): %.2f" % (self.features.compute_avg_contact_response_time()) - print "Ratio: %.2f" % (self.features.compute_response_time_ratio(self.root, self.get_contact())) - print "" + print("Average root response time (s): %.2f" % (self.features.compute_avg_root_response_time())) + print("Average contact response time (s): %.2f" % (self.features.compute_avg_contact_response_time())) + print("Ratio: %.2f" % (self.features.compute_response_time_ratio(self.root, self.get_contact()))) + print("") # print "Number of root bursts: %d" % (self.features.compute_nbr_root_burst()) # print "Average burst length: %.2ff" % (self.features.compute_avg_root_burst()) # print "" - print "Number of contact bursts: %d" % (self.features.compute_nbr_contact_burst()) - print "Average burst length: %.2ff" % (self.features.compute_avg_contact_burst()) - print "Ratio: %.2f" % (self.features.compute_bursts_ratio(self.root, self.get_contact())) - print "" + print("Number of contact bursts: %d" % (self.features.compute_nbr_contact_burst())) + print("Average burst length: %.2ff" % (self.features.compute_avg_contact_burst())) + print("Ratio: %.2f" % (self.features.compute_bursts_ratio(self.root, self.get_contact()))) + print("") for s in self.senders: if s == self.root: - print "Root initiations: %d" % (self.features.initiations[s]) + print("Root initiations: %d" % (self.features.initiations[s])) else: - print "Contact initiations: %d" % (self.features.initiations[s]) + print("Contact initiations: %d" % (self.features.initiations[s])) - print "Root initiation ratio: %.2f" % (self.features.compute_root_initation_ratio(self.root, self.get_contact())) - print "" + print( + "Root initiation ratio: %.2f" % (self.features.compute_root_initation_ratio(self.root, self.get_contact()))) + print("") - print "Proportions:" + print("Proportions:") pretty_print(self.features.proportions, self.features.proportions.keys()[0], 1) - print "" - print "Weekdays:" + print("") + print("Weekdays:") pretty_print(self.features.weekday, "Weekday", 0) - print "" - print "Shifts:" + print("") + print("Shifts:") pretty_print(self.features.shifts, "Shifts", 0) - print "" - print "Patterns:" + print("") + print("Patterns:") pretty_print(self.features.patterns, "Patterns", 0) - print "" - print "Most used words:" + print("") + print("Most used words:") for muw in self.features.most_used_words: - try: - print muw[0] + try: + print(muw[0]) except UnicodeEncodeError: self.features.most_used_words.remove(muw) def save_features(self, output_name): - output = {} - output["root"] = self.root - output["avg_response_time"] = {} + output = {"root": self.root, "avg_response_time": {}} for s in self.senders: if s == self.root: output["avg_response_time"][s] = self.features.compute_avg_root_response_time() @@ -188,14 +185,16 @@ def save_features(self, output_name): output["initiations"] = self.features.initiations - output["initiations"]["root_initiation_ratio"] = self.features.compute_root_initation_ratio(self.root, self.get_contact()) + output["initiations"]["root_initiation_ratio"] = self.features.compute_root_initation_ratio(self.root, + self.get_contact()) output["proportions"] = self.features.proportions output["weekdays"] = self.features.weekday output["shifts"] = self.features.shifts output["patterns"] = self.features.patterns output["senders"] = self.senders output["muw"] = self.features.most_used_words - output["outcome"] = self.features.generate_outcome(self.root, self.get_contact(), 0) #TODO: make macros for outcome methodology + output["outcome"] = self.features.generate_outcome(self.root, self.get_contact(), + 0) # TODO: make macros for outcome methodology # if fallback to default path, make sure the hardcoded folder `log` is present in the folder if output_name == "./logs/basic_stats.json": if not os.path.exists('logs'): @@ -204,21 +203,25 @@ def save_features(self, output_name): if output_name.endswith(".json"): arq = open(output_name, "w") else: - arq = open(output_name+".json", "w") + arq = open(output_name + ".json", "w") arq.write(json.dumps(output, indent=4, sort_keys=True)) arq.close() # In case path (directory) mentioned by the user doesn't exist except IOError: - print "\nI/O Error: Following path doesn't exist:", output_name, "\n" + print("\nI/O Error: Following path doesn't exist:", output_name, "\n") exit(1) + def main(): parser = argparse.ArgumentParser(description='Chatlog Feature Extractor') parser.add_argument('-f', '--file', help='Chatlog file', required=True) parser.add_argument('-n', '--root', help='Root name', required=False) - parser.add_argument('-p', '--platform', help='Platform', choices=["WhatsApp", "Facebook"], default="WhatsApp", required=False) - parser.add_argument('-r', '--regexes', help='Regex patterns to compute frequency', nargs="+", required=False, default=[]) - parser.add_argument('-o', '--output', help='JSON output file name', required=False, default="./logs/basic_stats.json") + parser.add_argument('-p', '--platform', help='Platform', choices=["WhatsApp", "Facebook"], default="WhatsApp", + required=False) + parser.add_argument('-r', '--regexes', help='Regex patterns to compute frequency', nargs="+", required=False, + default=[]) + parser.add_argument('-o', '--output', help='JSON output file name', required=False, + default="./logs/basic_stats.json") args = vars(parser.parse_args()) @@ -227,7 +230,7 @@ def main(): c.parse_messages() if args.get("root") is None: for i, s in enumerate(c.senders): - print str(i), s + print(str(i), s) c.set_root(c.senders[int(raw_input("Please choose one person to be the root: "))]) else: c.set_root(args["root"]) @@ -235,6 +238,6 @@ def main(): c.print_features() c.save_features(args["output"]) + if __name__ == "__main__": main() - \ No newline at end of file From d93bb47909ca332d6f3f31524bc20b214f5e97c3 Mon Sep 17 00:00:00 2001 From: Manvendra Singh Date: Fri, 7 Apr 2017 23:09:32 +0530 Subject: [PATCH 2/3] use unittest class, better tests. --- test/out/testChat2_UsernameOne.out | 92 +++++++++++++++--------------- test/out/testChat2_UsernameTwo.out | 92 +++++++++++++++--------------- test/test_chat.py | 44 +++++++++++--- wp_parser/wp_chat.py | 9 ++- 4 files changed, 133 insertions(+), 104 deletions(-) diff --git a/test/out/testChat2_UsernameOne.out b/test/out/testChat2_UsernameOne.out index 3ddd321..d2bc681 100644 --- a/test/out/testChat2_UsernameOne.out +++ b/test/out/testChat2_UsernameOne.out @@ -13,55 +13,55 @@ Root initiations: 0 Root initiation ratio: 1.00 Proportions: - avg_words - ratio -> 0.625 - UsernameOne -> 2.75 - UsernameTwo -> 4.4 - media - total -> 1 - ratio -> 1.0 - UsernameTwo -> 1 - UsernameOne -> 1 - exclams - ratio -> 0.333333333333 - UsernameOne -> 1 - total -> 3 - UsernameTwo -> 3 - qmarks - UsernameTwo -> 1 - total -> 4 - ratio -> 4.0 - UsernameOne -> 4 - messages - ratio -> 0.8 - UsernameOne -> 4 - UsernameTwo -> 5 - total -> 9 - words - ratio -> 0.5 - UsernameOne -> 11 - UsernameTwo -> 22 - total -> 33 - chars - ratio -> 0.528 - UsernameOne -> 66 - UsernameTwo -> 125 - total -> 191 +(' ', 'avg_words') +(' ', 'ratio', '->', 0.625) +(' ', 'UsernameOne', '->', 2.75) +(' ', 'UsernameTwo', '->', 4.4) +(' ', 'media') +(' ', 'total', '->', 1) +(' ', 'ratio', '->', 1.0) +(' ', 'UsernameTwo', '->', 1) +(' ', 'UsernameOne', '->', 1) +(' ', 'exclams') +(' ', 'ratio', '->', 0.3333333333333333) +(' ', 'UsernameOne', '->', 1) +(' ', 'total', '->', 3) +(' ', 'UsernameTwo', '->', 3) +(' ', 'qmarks') +(' ', 'UsernameTwo', '->', 1) +(' ', 'total', '->', 4) +(' ', 'ratio', '->', 4.0) +(' ', 'UsernameOne', '->', 4) +(' ', 'messages') +(' ', 'ratio', '->', 0.8) +(' ', 'UsernameOne', '->', 4) +(' ', 'UsernameTwo', '->', 5) +(' ', 'total', '->', 9) +(' ', 'words') +(' ', 'ratio', '->', 0.5) +(' ', 'UsernameOne', '->', 11) +(' ', 'UsernameTwo', '->', 22) +(' ', 'total', '->', 33) +(' ', 'chars') +(' ', 'ratio', '->', 0.528) +(' ', 'UsernameOne', '->', 66) +(' ', 'UsernameTwo', '->', 125) +(' ', 'total', '->', 191) Weekdays: - Tuesday -> 0 - Friday -> 0 - Thursday -> 0 - Sunday -> 0 - Saturday -> 0 - Monday -> 1 - Wednesday -> 8 +('', 'Tuesday', '->', 0) +('', 'Friday', '->', 0) +('', 'Thursday', '->', 0) +('', 'Sunday', '->', 0) +('', 'Saturday', '->', 0) +('', 'Monday', '->', 1) +('', 'Wednesday', '->', 8) Shifts: - evening -> 0 - latenight -> 0 - morning -> 1 - afternoon -> 8 +('', 'evening', '->', 0) +('', 'latenight', '->', 0) +('', 'morning', '->', 1) +('', 'afternoon', '->', 8) Patterns: @@ -75,4 +75,4 @@ time hello, media exclam!!! - 1.0 - UsernameOne -> 2.75 - UsernameTwo -> 4.4 - media - UsernameOne -> 0 - total -> 1 - ratio -> 1.0 - UsernameTwo -> 1 - exclams - UsernameOne -> 0 - ratio -> 1.0 - total -> 3 - UsernameTwo -> 3 - qmarks - ratio -> 1.0 - UsernameTwo -> 1 - total -> 4 - UsernameOne -> 4 - messages - ratio -> 1.0 - UsernameOne -> 4 - UsernameTwo -> 5 - total -> 9 - words - ratio -> 1.0 - UsernameOne -> 11 - UsernameTwo -> 22 - total -> 33 - chars - ratio -> 1.0 - UsernameOne -> 66 - UsernameTwo -> 125 - total -> 191 +(' ', 'avg_words') +(' ', 'ratio', '->', 1.0) +(' ', 'UsernameOne', '->', 2.75) +(' ', 'UsernameTwo', '->', 4.4) +(' ', 'media') +(' ', 'UsernameOne', '->', 0) +(' ', 'total', '->', 1) +(' ', 'ratio', '->', 1.0) +(' ', 'UsernameTwo', '->', 1) +(' ', 'exclams') +(' ', 'UsernameOne', '->', 0) +(' ', 'ratio', '->', 1.0) +(' ', 'total', '->', 3) +(' ', 'UsernameTwo', '->', 3) +(' ', 'qmarks') +(' ', 'ratio', '->', 1.0) +(' ', 'UsernameTwo', '->', 1) +(' ', 'total', '->', 4) +(' ', 'UsernameOne', '->', 4) +(' ', 'messages') +(' ', 'ratio', '->', 1.0) +(' ', 'UsernameOne', '->', 4) +(' ', 'UsernameTwo', '->', 5) +(' ', 'total', '->', 9) +(' ', 'words') +(' ', 'ratio', '->', 1.0) +(' ', 'UsernameOne', '->', 11) +(' ', 'UsernameTwo', '->', 22) +(' ', 'total', '->', 33) +(' ', 'chars') +(' ', 'ratio', '->', 1.0) +(' ', 'UsernameOne', '->', 66) +(' ', 'UsernameTwo', '->', 125) +(' ', 'total', '->', 191) Weekdays: - Tuesday -> 0 - Friday -> 0 - Thursday -> 0 - Sunday -> 0 - Saturday -> 0 - Monday -> 1 - Wednesday -> 8 +('', 'Tuesday', '->', 0) +('', 'Friday', '->', 0) +('', 'Thursday', '->', 0) +('', 'Sunday', '->', 0) +('', 'Saturday', '->', 0) +('', 'Monday', '->', 1) +('', 'Wednesday', '->', 8) Shifts: - evening -> 0 - latenight -> 0 - morning -> 1 - afternoon -> 8 +('', 'evening', '->', 0) +('', 'latenight', '->', 0) +('', 'morning', '->', 1) +('', 'afternoon', '->', 8) Patterns: @@ -75,4 +75,4 @@ time hello, media exclam!!! - Date: Fri, 7 Apr 2017 23:15:03 +0530 Subject: [PATCH 3/3] travis config: use nose instead of py.test --- .travis.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 5738124..afd16d1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,6 +4,6 @@ python: - "3.4" - "3.5" # - "3.6" -#install: -# - pip install . -script: py.test +install: + - pip install . +script: nosetests