diff --git a/final.py b/final.py
new file mode 100644
index 0000000..e398433
--- /dev/null
+++ b/final.py
@@ -0,0 +1,57 @@
+import nltk
+
+from nltk import FreqDist
+from nltk.corpus import stopwords
+from nltk.tokenize import RegexpTokenizer
+
+
+def get_sentence(fullData):
+    # Treat line breaks as sentence boundaries, then split on ". ".
+    sentence = fullData.replace("\n", ". ")
+    return sentence.split(". ")
+
+
+def re_order(output_sents, input):
+    # Sort the selected sentences by their position in the original text
+    # so the summary reads in the same order as the source document.
+    output_sents.sort(lambda s1, s2: input.find(s1) - input.find(s2))
+    return output_sents
+
+
+def summary(fullData, summary_length):
+    myregex = RegexpTokenizer(r'\w+')
+
+    # Convert all the words to lowercase for ease of processing.
+    words = [word.lower() for word in myregex.tokenize(fullData)]
+
+    # Remove all stopwords; build the set once for fast lookup.
+    stop_words = set(stopwords.words('english'))
+    words = [word for word in words if word not in stop_words]
+
+    frequency = FreqDist(words)
+    most_frequent = [word for word, count in frequency.most_common(100)]
+
+    # sentence holds the original sentences with their case intact;
+    # usable_sentence holds lowercased copies for matching.
+    # See http://www.nltk.org/_modules/nltk/tokenize/punkt.html for more information.
+    my_token = nltk.data.load('tokenizers/punkt/english.pickle')
+    sentence = my_token.tokenize(fullData)
+    usable_sentence = [line.lower() for line in sentence]
+    sentence_length = len(usable_sentence)
+
+    # For each of the most frequent words, take the first sentence that
+    # contains it and has not already been selected, until the summary is full.
+    output_sents = []
+    for elem in most_frequent:
+        if len(output_sents) >= summary_length:
+            break
+        for i in range(0, sentence_length):
+            if elem in usable_sentence[i] and sentence[i] not in output_sents:
+                output_sents.append(sentence[i])
+                break
+
+    return re_order(output_sents, fullData)
+
+
+def get_summary(fullData, summary_length):
+    print "\n".join(summary(fullData, summary_length))
diff --git a/runner11.py b/runner11.py
new file mode 100644
index 0000000..842cb62
--- /dev/null
+++ b/runner11.py
@@ -0,0 +1,16 @@
+import os
+
+from final import get_sentence, get_summary
+
+filepath = os.path.join(os.getcwd(), "file2.txt")
+with open(filepath) as obj:
+    fullData = obj.read()
+    fullData = fullData.decode('utf8', 'ignore')
+
+# Use the first sentence of the document as its title.
+title_of_summary = get_sentence(fullData)[0]
+print "\t\t\t\t\t\t\t\t\t*---------------------------------Title------------------------------*\n%s" % title_of_summary
+
+summary_length = int(raw_input("Enter the number of lines you want your summary to be.\n"))
+print "\n\t\t\t\t\t\t\t\t\t*---------------------------------Summary------------------------------*\n"
+get_summary(fullData, summary_length)
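For reference, a minimal sketch of how to exercise the new summary() function on an inline string instead of file2.txt (runner11.py already covers the file-based flow). The doc string below is made up for illustration, and the sketch assumes the NLTK "punkt" and "stopwords" data sets have already been downloaded (e.g. via nltk.download()):

    from final import summary

    doc = ("NLTK is a platform for building Python programs. "
           "It provides interfaces to corpora and lexical resources. "
           "NLTK also supports tokenization and classification. "
           "The weather was pleasant yesterday.")

    # "nltk" is the most frequent non-stopword term, so the first sentence
    # containing it is always picked; which sentence is picked second depends
    # on how frequency ties are broken. re_order() then returns the picks in
    # source order.
    print summary(doc, 2)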