-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathreadableTextFile.py
More file actions
65 lines (60 loc) · 2.2 KB
/
readableTextFile.py
File metadata and controls
65 lines (60 loc) · 2.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import re
class ReadableTextFile:
    '''Collects the header fields and the message body of one e-mail.

    The parsed values are stored in ``self.data`` under the keys
    "sender", "date", "subject", "directRecipients", "ccRecipients"
    and "messageBody". Call parseMetaData first and parseMessageBody
    second, both with the same list of lines.
    '''

    # Compiled once; shared by every grabEmails call.
    _EMAIL_PATTERN = re.compile(r'[\w\.-]+@[\w\.-]+')
    # A literal asterisk followed by a tab, removed from body lines.
    _BULLET_PATTERN = re.compile(r"\*\t")

    def __init__(self):
        self.data = {"sender": "", "date": "", "subject": "",
                     "directRecipients": [], "ccRecipients": [],
                     "messageBody": ""}
        # dynamicFields[i] is the data key filled by the text found between
        # fieldBreaks[i] and fieldBreaks[i + 1].
        self.dynamicFields = ["subject", "directRecipients", "ccRecipients"]
        self.fieldBreaks = ["Subject: ", "To: ", "CC: ", "Content-Type: "]

    def parseMetaData(self, corpus):
        '''Fill every entry of self.data except "messageBody".

        Per the original author's note, corpus is expected to be the
        result of the trimFileEdges method in the parseFile file; here it
        is only indexed and iterated as a sequence of strings.

        corpus[0] supplies the sender (stored as the list of addresses
        found on that line) and corpus[1] supplies the date (the stripped
        line minus its first five characters). The remaining fields are
        read by scanning for the markers in self.fieldBreaks in order:
        the subject is the text between "Subject: " and "To: " joined with
        single spaces, and the two recipient lists are the addresses found
        between their marker and the next one. Scanning stops at
        "Content-Type: ". A field whose closing marker never appears is
        left at its previous value.

        Raises IndexError when corpus has fewer than two lines.
        '''
        self.data["sender"] = self.grabEmails(corpus[0])
        self.data["date"] = corpus[1].strip()[5:]  # Very reliable placement of these two

        field = 0
        dataField = []
        appending = False
        subjectMarker = self.fieldBreaks[0]
        lastField = len(self.dynamicFields)
        for line in corpus:
            if not appending:
                # Skip everything that precedes the subject line.
                if self.fieldBreaks[field] not in line:
                    continue
                appending = True

            if self.fieldBreaks[field + 1] in line:
                # The next marker closes the field collected so far.
                if field == 0:
                    self.data[self.dynamicFields[field]] = ' '.join(dataField)
                else:
                    self.data[self.dynamicFields[field]] = dataField
                field += 1
                if field == lastField:
                    break
                # The marker line itself already carries the first addresses.
                dataField = self.grabEmails(line)
            elif field == 0:
                # Only drop the marker where it is actually present, so that
                # continuation lines of a long subject keep all their text.
                if subjectMarker in line:
                    text = line.partition(subjectMarker)[2]
                else:
                    text = line
                dataField.append(text.strip())
            else:
                dataField.extend(self.grabEmails(line))

    def parseMessageBody(self, corpus):
        '''Store the message body in self.data["messageBody"].

        To be run after parseMetaData with the same corpus: the body is
        taken to start on the first line that contains the subject text
        (compared case-insensitively) without being the "Subject: " header
        line itself. The text after the subject on that line is the first
        piece of the body; every later line is stripped, has "*<tab>"
        sequences removed, and is appended. The pieces are joined with
        single spaces.

        NOTE(review): with an empty subject every line matches, so the
        body then starts at the first line lacking "subject: ".
        '''
        body = []
        tripped = False
        subject = self.data["subject"].lower().strip()
        erroneousMatch = "subject: " + self.data["subject"].lower()
        for line in corpus:
            if tripped:
                body.append(self._BULLET_PATTERN.sub("", line.strip()))
                continue
            testLine = line.lower()
            if subject in testLine and erroneousMatch not in testLine:
                cutOff = testLine.find(subject) + len(subject)
                body.append(line[cutOff:].strip())
                tripped = True
        self.data["messageBody"] = ' '.join(body)

    def grabEmails(self, line):
        '''Return a list of all email addresses found in the given line.'''
        return self._EMAIL_PATTERN.findall(line)