From 1e65d58aa95d9131dda2144740fdd63e773318a2 Mon Sep 17 00:00:00 2001
From: bvpranu97 <49627284+bvpranu97@users.noreply.github.com>
Date: Fri, 2 Oct 2020 10:15:31 +0530
Subject: [PATCH 1/2] Add files via upload

---
 crawl words.py | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)
 create mode 100644 crawl words.py

diff --git a/crawl words.py b/crawl words.py
new file mode 100644
index 0000000..834e7c6
--- /dev/null
+++ b/crawl words.py
@@ -0,0 +1,57 @@
+import requests
+from bs4 import BeautifulSoup
+import operator
+from collections import Counter
+
+def start(url):
+
+
+    wordlist = []
+    source_code = requests.get(url).text
+
+
+    soup = BeautifulSoup(source_code, 'html.parser')
+
+
+    for each_text in soup.findAll('div', {'class':'entry-content'}):
+        content = each_text.text
+
+        words = content.lower().split()
+
+        for each_word in words:
+            wordlist.append(each_word)
+    clean_wordlist(wordlist)
+
+
+def clean_wordlist(wordlist):
+
+    clean_list =[]
+    for word in wordlist:
+        symbols = '!@#$%^&*()_-+={[}]|\;:"<>?/., '
+
+        for i in range (0, len(symbols)):
+            word = word.replace(symbols[i], '')
+
+        if len(word) > 0:
+            clean_list.append(word)
+    create_dictionary(clean_list)
+
+def create_dictionary(clean_list):
+    word_count = {}
+
+    for word in clean_list:
+        if word in word_count:
+            word_count[word] += 1
+        else:
+            word_count[word] = 1
+
+
+    c = Counter(word_count)
+
+    # returns the most occurring elements
+    top = c.most_common(10)
+    print(top)
+
+
+if __name__ == '__main__':
+    start("https://github.com")

From 85fbd305b4621de916e4c3dbec9c20b5f62673be Mon Sep 17 00:00:00 2001
From: bvpranu97 <49627284+bvpranu97@users.noreply.github.com>
Date: Fri, 2 Oct 2020 10:16:27 +0530
Subject: [PATCH 2/2] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 317e479..0fd47c6 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,4 @@
 # Web Crawler Using Python
 This is a simple webcrawler to crawl a website. It uses python programming language to perform crawling.
 
+Change the url to your needs as start function parameter.
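
A minimal usage sketch for the crawler added in PATCH 1/2 (the URL below is a placeholder, not taken from the patch): start() fetches the page, keeps only the text inside <div class="entry-content"> blocks, strips punctuation in clean_wordlist(), and prints the ten most frequent words, so pointing the script at another site usually means changing both the URL at the bottom of crawl words.py and the hard-coded 'entry-content' selector.

```python
# Hypothetical edit to the bottom of "crawl words.py" -- the URL is a placeholder.
# Only an empty list is printed unless the target page actually contains
# <div class="entry-content"> elements, since that selector is hard-coded in start().
if __name__ == '__main__':
    start("https://example.com/some-article")
```

As a side note on the counting step, collections.Counter can consume the cleaned word list directly (Counter(clean_list).most_common(10)), so the manual word_count dictionary built in create_dictionary() is not strictly required.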