95 changes: 25 additions & 70 deletions utilities/kym_scrape.py
@@ -4,7 +4,7 @@
Know Your Meme Scraper
Grabs all titles from https://knowyourmeme.com

Puts output into memes.txt
Sends output to stdout.

Used to feed into password cracking wordlists like
https://github.com/initstring/passphrase-wordlist
@@ -13,95 +13,54 @@
"""

import html
import time
import itertools
import re
import sys
import requests
import time
import urllib.request

# The "all" URL allows autoscrolling
KYM_URL = 'https://knowyourmeme.com/memes/all/page'

# Regex to grab all formatted titles
RE_TITLE = re.compile(r'<h3[^>]*>\s*(.*?)\s*</h3>')

# Text to know when we reached end of line
NO_MORE = 'There are no entries for this category'

# Need real headers to get past WAF
HEADERS = {'User-Agent': 'Mozilla/5.0'}

# Out file
OUTFILE = 'memes.txt'

# File for in-process scraping
LOGFILE = 'memes-incomplete.txt'

# Sleep to avoid IP ban
SLEEP = 3

def write_log(phrases):
"""
Logs phrases as the program runs

Used for troubleshooting or to at least have _something_ in the case of
IP ban, failure, etc
"""
with open(LOGFILE, 'a') as logfile:
for phrase in phrases:
phrase = html.unescape(phrase)
logfile.write(phrase + '\n')

def write_final(phrases):
"""
Writes all phrases to a log file
"""
# Unescape the HTML and write the phrases out
with open(OUTFILE, 'w') as outfile:
for phrase in phrases:
phrase = html.unescape(phrase)
outfile.write(phrase + '\n')

def scrape_pages():
"""
Loops through all pages of kym
"""
page = 0
phrases = set([])
phrases = set()

while True:
for page in itertools.count():
# Build the URL based on auto-scroll behaviour
url = "{}/{}".format(KYM_URL, page)
response = requests.get(url, headers=HEADERS)

# Check for IP ban
if response.status_code == 403:
print("\n[!] You have been IP banned. Oops.")
sys.exit()

# Return if no more results
if response.status_code == 404:
print("\n[*] Reached end of line at page {}. Exiting"
.format(page))
return phrases

# Clear stdout for ongoing notifications
sys.stdout.flush()
sys.stdout.write(" " * 20)
sys.stdout.write("\r")
req = urllib.request.Request(f"{KYM_URL}/{page}", headers=HEADERS)
try:
response = urllib.request.urlopen(req)
except urllib.error.HTTPError as e:
# Check for IP ban
if e.code == 403:
sys.exit("\n[!] You have been IP banned. Oops.")

# Return if no more results
if e.code == 404:
print(f"\n[*] Reached end of line at page {page}. Exiting", file=sys.stderr)
return phrases

# Grab phrases from the raw text and add to set
new_phrases = re.findall(RE_TITLE, response.text)
new_phrases = set(RE_TITLE.findall(response.read().decode()))
for new_phrase in new_phrases - phrases:
print(html.unescape(new_phrase))
phrases.update(new_phrases)

# Write the new phrases to an ongoing logfile
write_log(new_phrases)

# Update the patiently waiting user
sys.stdout.write("[*] Page: {}, Phrases: {}, Unique Phrases: {}"
.format(page, len(new_phrases), len(phrases)))

# Increment the page for the next loop
page += 1
sys.stderr.write(f"[*] Page: {page}, Phrases: {len(new_phrases)}, Unique Phrases: {len(phrases)}\r")
sys.stderr.flush()

# Sleep to avoid IP ban
time.sleep(SLEEP)
@@ -111,12 +70,8 @@ def main():
"""
Main program function
"""
print("[*] Scraping all pages of KYM...")
phrases = scrape_pages()

print("[+] Found {} phrases, writing to {}..."
.format(len(phrases), OUTFILE))
write_final(phrases)
print("[*] Scraping all pages of KYM...", file=sys.stderr)
scrape_pages()


if __name__ == "__main__":
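With this change the wordlist is written to stdout while progress and status messages go to stderr, so a run can be captured with a plain redirect. A minimal usage sketch, assuming the script is invoked from the repository root (the dated filename is only illustrative):

```
# Titles arrive on stdout; progress stays visible on the terminal via stderr
python3 utilities/kym_scrape.py > know-your-meme-$(date +%Y-%m-%d).txt
```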
37 changes: 17 additions & 20 deletions utilities/updating-sources.md
@@ -7,30 +7,28 @@ Some of the source files get regular updates. Below is a guide to obtaining thos
## IMDB titles

```
wget https://datasets.imdbws.com/title.basics.tsv.gz
gunzip ./title.basics.tsv.gz
cat title.basics.tsv | awk -F '\t' '{print $3}' > ./imdb-titles-$(date +%Y-%m-%d).txt
rm title.basics.tsv
curl https://datasets.imdbws.com/title.basics.tsv.gz \
| gunzip \
| awk -F '\t' '{print $3}' \
> ./imdb-titles-$(date +%Y-%m-%d).txt
```

## Wikipedia article titles & category names

```
wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles-multistream-index.txt.bz2
gunzip2 ./enwiki-latest-pages-articles-multistream-index.txt.bz2
cat ./enwiki-latest-pages-articles-multistream-index.txt | cut -d: -f 3 > ./wikipedia-$(date +%Y-%m-%d).txt
rm enwiki-latest-pages-articles-multistream-index.txt

curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles-multistream-index.txt.bz2 \
| bzip2 -d \
| cut -d: -f 3 \
> ./wikipedia-$(date +%Y-%m-%d).txt
```

## Wiktionary titles

```
wget https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-all-titles.gz
gunzip enwiktionary-latest-all-titles.gz
cat enwiktionary-latest-all-titles | awk -F '\t' '{print $2}' > ./wiktionary-$(date +%Y-%m-%d).txt
rm enwiktionary-latest-all-titles

curl https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-all-titles.gz \
| gunzip \
| awk -F '\t' '{print $2}' \
> ./wiktionary-$(date +%Y-%m-%d).txt
```

## Urban Dictionary
@@ -45,18 +43,17 @@ python3 ./main.py --out urban-dictionary-$(date +%Y-%m-%d).txt
## Know Your Meme

```
python3 /utilities/kym_scrape.py
mv memes.txt ./know-your-meme-$(date +%Y-%m-%d).txt
python3 utilities/kym_scrape.py > know-your-meme-$(date +%Y-%m-%d).txt
```

## Global POI dataset

```
wget http://download.geonames.org/export/dump/allCountries.zip
unzip ./allCountries.zip
cat allCountries.txt | awk -F '\t' '{print $3}' > ./global-poi-$(date +%Y-%m-%d).txt
unzip -p ./allCountries.zip allCountries.txt \
| awk -F '\t' '{print $3}' \
> ./global-poi-$(date +%Y-%m-%d).txt
rm allCountries.zip
rm allCountries.txt
```

## Billboard charts
@@ -84,4 +81,4 @@ If you generate a new version and want to compare what's new you can use a comma

```
sort new.txt old.txt | uniq -u
```
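Note that `sort new.txt old.txt | uniq -u` prints the lines unique to either file, so removed entries show up alongside new ones. If only the newly added lines are of interest, a `comm`-based variant (assuming bash, for process substitution) is one option:

```
# Lines present in new.txt but absent from old.txt
comm -13 <(sort old.txt) <(sort new.txt)
```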