diff --git a/utilities/kym_scrape.py b/utilities/kym_scrape.py
index 7471131..8ea390d 100755
--- a/utilities/kym_scrape.py
+++ b/utilities/kym_scrape.py
@@ -4,7 +4,7 @@
Know Your Meme Scraper
Grabs all titles from https://knowyourmeme.com
-Puts output into memes.txt
+Sends output to stdout.
Used to feed into password cracking wordlists like
https://github.com/initstring/passphrase-wordlist
@@ -13,10 +13,11 @@
"""
import html
-import time
+import itertools
import re
import sys
-import requests
+import time
+import urllib.error
+import urllib.request
# The "all" URL allows autoscrolling
KYM_URL = 'https://knowyourmeme.com/memes/all/page'
@@ -24,84 +25,42 @@
# Regex to grab all formatted titles
RE_TITLE = re.compile(r'<h2[^>]*>\s*(.*?)\s*</h2>')
-# Text to know when we reached end of line
-NO_MORE = 'There are no entries for this category'
-
# Need real headers to get past WAF
HEADERS = {'User-Agent': 'Mozilla/5.0'}
-# Out file
-OUTFILE = 'memes.txt'
-
-# File for in-process scraping
-LOGFILE = 'memes-incomplete.txt'
-
# Sleep to avoid IP ban
SLEEP = 3
-def write_log(phrases):
- """
- Logs phrases as the program runs
-
- Used for troubleshooting or to at least have _something_ in the case of
- IP ban, failure, etc
- """
- with open(LOGFILE, 'a') as logfile:
- for phrase in phrases:
- phrase = html.unescape(phrase)
- logfile.write(phrase + '\n')
-
-def write_final(phrases):
- """
- Writes all phrases to a log file
- """
- # Unescape the HTML and write the phrases out
- with open(OUTFILE, 'w') as outfile:
- for phrase in phrases:
- phrase = html.unescape(phrase)
- outfile.write(phrase + '\n')
-
def scrape_pages():
"""
Loops through all pages of kym
"""
- page = 0
- phrases = set([])
+ phrases = set()
- while True:
+ for page in itertools.count():
# Build the URL based on auto-scroll behaviour
- url = "{}/{}".format(KYM_URL, page)
- response = requests.get(url, headers=HEADERS)
-
- # Check for IP ban
- if response.status_code == 403:
- print("\n[!] You have been IP banned. Oops.")
- sys.exit()
-
- # Return if no more results
- if response.status_code == 404:
- print("\n[*] Reached end of line at page {}. Exiting"
- .format(page))
- return phrases
-
- # Clear stdout for ongoing notifications
- sys.stdout.flush()
- sys.stdout.write(" " * 20)
- sys.stdout.write("\r")
+ req = urllib.request.Request(f"{KYM_URL}/{page}", headers=HEADERS)
+ try:
+ response = urllib.request.urlopen(req)
+        except urllib.error.HTTPError as e:
+            # Check for IP ban
+            if e.code == 403:
+                sys.exit("\n[!] You have been IP banned. Oops.")
+
+            # Return if no more results
+            if e.code == 404:
+                print(f"\n[*] Reached the end of the line at page {page}. Exiting", file=sys.stderr)
+                return phrases
+
+            # Anything else is unexpected, so re-raise it
+            raise
# Grab phrases from the raw text and add to set
- new_phrases = re.findall(RE_TITLE, response.text)
+        new_phrases = set(RE_TITLE.findall(response.read().decode("utf-8", "replace")))
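+        # Print each phrase the first time it is seen, HTML-unescaped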
+        for new_phrase in new_phrases - phrases:
+            print(html.unescape(new_phrase))
phrases.update(new_phrases)
- # Write the new phrases to an ongoing logile
- write_log(new_phrases)
-
# Update the patiently waiting user
- sys.stdout.write("[*] Page: {}, Phrases: {}, Unique Phrases: {}"
- .format(page, len(new_phrases), len(phrases)))
-
- # Increment the page for the next loop
- page += 1
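+        # The trailing \r returns the cursor so the next status line overwrites this one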
+ sys.stderr.write(f"[*] Page: {page}, Phrases: {len(new_phrases)}, Unique Phrases: {len(phrases)}\r")
+ sys.stderr.flush()
# Sleep to avoid IP ban
time.sleep(SLEEP)
@@ -111,12 +70,8 @@ def main():
"""
Main program function
"""
- print("[*] Scraping all pages of KYM...")
- phrases = scrape_pages()
-
- print("[+] Found {} phrases, writing to {}..."
- .format(len(phrases), OUTFILE))
- write_final(phrases)
+ print("[*] Scraping all pages of KYM...", file=sys.stderr)
+ scrape_pages()
if __name__ == "__main__":
diff --git a/utilities/updating-sources.md b/utilities/updating-sources.md
index 04a4848..adc97b0 100644
--- a/utilities/updating-sources.md
+++ b/utilities/updating-sources.md
@@ -7,30 +7,28 @@ Some of the source files get regular updates. Below is a guide to obtaining thos
## IMDB titles
```
-wget https://datasets.imdbws.com/title.basics.tsv.gz
-gunzip ./title.basics.tsv.gz
-cat title.basics.tsv | awk -F '\t' '{print $3}' > ./imdb-titles-$(date +%Y-%m-%d).txt
-rm title.basics.tsv
+curl https://datasets.imdbws.com/title.basics.tsv.gz \
+ | gunzip \
+ | awk -F '\t' '{print $3}' \
+ > ./imdb-titles-$(date +%Y-%m-%d).txt
```
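
+Field 3 of title.basics.tsv is the primaryTitle column.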
## Wikipedia article titles & category names
```
-wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles-multistream-index.txt.bz2
-gunzip2 ./enwiki-latest-pages-articles-multistream-index.txt.bz2
-cat ./enwiki-latest-pages-articles-multistream-index.txt | cut -d: -f 3 > ./wikipedia-$(date +%Y-%m-%d).txt
-rm enwiki-latest-pages-articles-multistream-index.txt
-
+curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles-multistream-index.txt.bz2 \
+  | bunzip2 \
+  | cut -d: -f 3- \
+ > ./wikipedia-$(date +%Y-%m-%d).txt
```
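
+Each index line has the form offset:page_id:title, and titles can themselves
+contain colons, hence `cut -f 3-`.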
## Wiktionary titles
```
-wget https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-all-titles.gz
-gunzip enwiktionary-latest-all-titles.gz
-cat enwiktionary-latest-all-titles | awk -F '\t' '{print $2}' > ./wiktionary-$(date +%Y-%m-%d).txt
-rm enwiktionary-latest-all-titles
-
+curl https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-all-titles.gz \
+ | gunzip \
+ | awk -F '\t' '{print $2}' \
+ > ./wiktionary-$(date +%Y-%m-%d).txt
```
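
+Field 1 of the all-titles dump is the namespace ID; field 2 is the page title.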
## Urban Dictionary
@@ -45,18 +43,17 @@ python3 ./main.py --out urban-dictionary-$(date +%Y-%m-%d).txt
## Know Your Meme
```
-python3 /utilities/kym_scrape.py
-mv memes.txt ./know-your-meme-$(date +%Y-%m-%d).txt
+python3 utilities/kym_scrape.py > know-your-meme-$(date +%Y-%m-%d).txt
```
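
+The scraper prints progress to stderr, so only meme titles land in the redirected file.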
## Global POI dataset
```
wget http://download.geonames.org/export/dump/allCountries.zip
-unzip ./allCountries.zip
-cat allCountries.txt | awk -F '\t' '{print $3}' > ./global-poi-$(date +%Y-%m-%d).txt
+unzip -p ./allCountries.zip allCountries.txt \
+ | awk -F '\t' '{print $3}' \
+ > ./global-poi-$(date +%Y-%m-%d).txt
rm allCountries.zip
-rm allCountries.txt
```
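
+Field 3 of allCountries.txt is the ASCII name of each point.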
## Billboard charts
@@ -84,4 +81,4 @@ If you generate a new version and want to compare what's new you can use a comma
```
sort new.txt old.txt | uniq -u
-```
\ No newline at end of file
+```
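+
+To list only the lines that are new, rather than the symmetric difference, a
+`comm` invocation like this should also work (assumes bash, for the process
+substitution):
+
+```
+comm -13 <(sort old.txt) <(sort new.txt)
+```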