Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# Web Crawler Using Python
This is a simple web crawler, written in Python, that crawls a website and prints the most common words found on a page.
It fetches the page, extracts the article text, and counts word frequencies.
Change the URL passed to the `start` function to the site you want to crawl.
57 changes: 57 additions & 0 deletions crawl words.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import requests
from bs4 import BeautifulSoup
import operator
from collections import Counter

def start(url):
    """Crawl *url* and report the 10 most common words in its article body.

    Fetches the page, extracts the text of every ``<div class="entry-content">``,
    lowercases and splits it into words, then hands the combined word list to
    clean_wordlist() for punctuation stripping and counting (which prints the
    top-10 result).

    :param url: address of the page to crawl.
    """
    # timeout prevents the crawler from hanging forever on a stalled server
    source_code = requests.get(url, timeout=10).text
    soup = BeautifulSoup(source_code, 'html.parser')

    # One flat list of words across all matching divs.
    wordlist = []
    for each_text in soup.findAll('div', {'class': 'entry-content'}):
        wordlist.extend(each_text.text.lower().split())

    # Count once, after all divs have been collected (the original's stripped
    # indentation suggested this might run per-div, printing repeatedly).
    clean_wordlist(wordlist)


def clean_wordlist(wordlist):
    """Strip punctuation/symbol characters from each word and forward the
    non-empty results to create_dictionary() for counting.

    Uses str.translate with a deletion table: one C-level pass per word
    instead of ~30 chained str.replace calls.  The raw string also fixes the
    original's invalid ``'\\;'`` escape sequence (a SyntaxWarning on modern
    Python).

    :param wordlist: list of lowercase word strings, possibly containing
        punctuation.
    """
    # Characters to delete from every word; raw string so the backslash is
    # a literal backslash, not an escape.
    symbols = r'!@#$%^&*()_-+={[}]|\;:"<>?/., '
    delete_table = str.maketrans('', '', symbols)

    clean_list = []
    for word in wordlist:
        cleaned = word.translate(delete_table)
        if cleaned:  # drop words that were pure punctuation
            clean_list.append(cleaned)

    create_dictionary(clean_list)

def create_dictionary(clean_list):
    """Count word frequencies in *clean_list* and print the 10 most common
    (word, count) pairs as a list.

    Counter(iterable) counts directly; the original built a plain dict by
    hand and then wrapped it in a Counter, doing the same counting twice.

    :param clean_list: list of cleaned word strings.
    """
    word_count = Counter(clean_list)

    # most_common sorts by count, ties broken by first-insertion order.
    top = word_count.most_common(10)
    print(top)


# Script entry point: change the URL to the site you want to crawl (see
# README).  NOTE(review): github.com's pages likely contain no
# 'entry-content' divs, so this default URL will produce an empty result
# — verify against a target site that uses that class.
if __name__ == '__main__':
    start("https://github.com")