diff --git a/README.md b/README.md
index 317e479..0fd47c6 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,4 @@
 # Web Crawler Using Python
 This is a simple webcrawler to crawl a website. It uses python programming language to perform crawling.
+Change the URL passed to the start() function to suit your needs.
diff --git a/crawl words.py b/crawl words.py
new file mode 100644
index 0000000..834e7c6
--- /dev/null
+++ b/crawl words.py
@@ -0,0 +1,57 @@
+import requests
+from bs4 import BeautifulSoup
+import operator
+from collections import Counter
+
+def start(url):
+
+
+    wordlist = []
+    source_code = requests.get(url).text
+
+
+    soup = BeautifulSoup(source_code, 'html.parser')
+
+
+    for each_text in soup.findAll('div', {'class': 'entry-content'}):
+        content = each_text.text
+
+        words = content.lower().split()
+
+        for each_word in words:
+            wordlist.append(each_word)
+    clean_wordlist(wordlist)
+
+
+def clean_wordlist(wordlist):
+
+    clean_list = []
+    for word in wordlist:
+        symbols = '!@#$%^&*()_-+={[}]|\\;:"<>?/., '
+
+        for i in range(0, len(symbols)):
+            word = word.replace(symbols[i], '')
+
+        if len(word) > 0:
+            clean_list.append(word)
+    create_dictionary(clean_list)
+
+def create_dictionary(clean_list):
+    word_count = {}
+
+    for word in clean_list:
+        if word in word_count:
+            word_count[word] += 1
+        else:
+            word_count[word] = 1
+
+
+    c = Counter(word_count)
+
+    # returns the most occurring elements
+    top = c.most_common(10)
+    print(top)
+
+
+if __name__ == '__main__':
+    start("https://github.com")
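
Note on usage: the script above only keeps text found inside div elements with the class entry-content, a convention of WordPress-style blog themes, so the default https://github.com start URL will usually match nothing and print an empty list. Below is a minimal, hypothetical sketch of the same idea for a blog-style page; the top_words name, the css_class parameter, and the example URL are illustrative assumptions and not part of this repository.

# Minimal sketch: fetch a page, keep the text of a chosen container,
# and count word frequencies with collections.Counter.
# The URL and css_class below are placeholders; adjust them to the target site.
import re
from collections import Counter

import requests
from bs4 import BeautifulSoup

def top_words(url, css_class="entry-content", n=10):
    """Return the n most common words found inside <div class=css_class>."""
    html = requests.get(url, timeout=10).text
    soup = BeautifulSoup(html, "html.parser")
    words = []
    for block in soup.find_all("div", {"class": css_class}):
        # keep only lower-cased alphanumeric "words", dropping punctuation
        words.extend(re.findall(r"[a-z0-9']+", block.get_text().lower()))
    return Counter(words).most_common(n)

if __name__ == "__main__":
    # hypothetical target; any page whose content sits in div.entry-content works
    print(top_words("https://example.com/some-blog-post"))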