diff --git a/utilities/kym_scrape.py b/utilities/kym_scrape.py
index 7471131..8ea390d 100755
--- a/utilities/kym_scrape.py
+++ b/utilities/kym_scrape.py
@@ -4,7 +4,7 @@
 Know Your Meme Scraper
 
 Grabs all titles from https://knowyourmeme.com
-Puts output into memes.txt
+Sends output to stdout.
 
 Used to feed into password cracking wordlists like
 https://github.com/initstring/passphrase-wordlist
@@ -13,10 +13,11 @@
 """
 import html
-import time
+import itertools
 import re
 import sys
-import requests
+import time
+import urllib.request
 
 # The "all" URL allows autoscrolling
 KYM_URL = 'https://knowyourmeme.com/memes/all/page'
 
@@ -24,84 +25,42 @@
 # Regex to grab all formatted titles
 RE_TITLE = re.compile(r'<h2[^>]*>\s*(.*?)\s*</h2>')
 
-# Text to know when we reached end of line
-NO_MORE = 'There are no entries for this category'
-
 # Need real headers to get past WAF
 HEADERS = {'User-Agent': 'Mozilla/5.0'}
 
-# Out file
-OUTFILE = 'memes.txt'
-
-# File for in-process scraping
-LOGFILE = 'memes-incomplete.txt'
-
 # Sleep to avoid IP ban
 SLEEP = 3
 
-def write_log(phrases):
-    """
-    Logs phrases as the program runs
-
-    Used for troubleshooting or to at least have _something_ in the case of
-    IP ban, failure, etc
-    """
-    with open(LOGFILE, 'a') as logfile:
-        for phrase in phrases:
-            phrase = html.unescape(phrase)
-            logfile.write(phrase + '\n')
-
-def write_final(phrases):
-    """
-    Writes all phrases to a log file
-    """
-    # Unescape the HTML and write the phrases out
-    with open(OUTFILE, 'w') as outfile:
-        for phrase in phrases:
-            phrase = html.unescape(phrase)
-            outfile.write(phrase + '\n')
-
 def scrape_pages():
     """
    Loops through all pages of kym
     """
-    page = 0
-    phrases = set([])
+    phrases = set()
 
-    while True:
+    for page in itertools.count():
         # Build the URL based on auto-scroll behaviour
-        url = "{}/{}".format(KYM_URL, page)
-        response = requests.get(url, headers=HEADERS)
-
-        # Check for IP ban
-        if response.status_code == 403:
-            print("\n[!] You have been IP banned. Oops.")
-            sys.exit()
-
-        # Return if no more results
-        if response.status_code == 404:
-            print("\n[*] Reached end of line at page {}. Exiting"
-                  .format(page))
-            return phrases
-
-        # Clear stdout for ongoing notifications
-        sys.stdout.flush()
-        sys.stdout.write(" " * 20)
-        sys.stdout.write("\r")
+        req = urllib.request.Request(f"{KYM_URL}/{page}", headers=HEADERS)
+        try:
+            response = urllib.request.urlopen(req)
+        except urllib.error.HTTPError as e:
+            # Check for IP ban
+            if e.code == 403:
+                sys.exit("\n[!] You have been IP banned. Oops.")
+
+            # Return if no more results
+            if e.code == 404:
Exiting", file=sys.stderr) + return phrases # Grab phrases from the raw text and add to set - new_phrases = re.findall(RE_TITLE, response.text) + new_phrases = set(RE_TITLE.findall(response.read().decode())) + for new_phrase in new_phrases - phrases: + print(html.unescape(new_phrase)) phrases.update(new_phrases) - # Write the new phrases to an ongoing logile - write_log(new_phrases) - # Update the patiently waiting user - sys.stdout.write("[*] Page: {}, Phrases: {}, Unique Phrases: {}" - .format(page, len(new_phrases), len(phrases))) - - # Increment the page for the next loop - page += 1 + sys.stderr.write(f"[*] Page: {page}, Phrases: {len(new_phrases)}, Unique Phrases: {len(phrases)}\r") + sys.stderr.flush() # Sleep to avoid IP ban time.sleep(SLEEP) @@ -111,12 +70,8 @@ def main(): """ Main program function """ - print("[*] Scraping all pages of KYM...") - phrases = scrape_pages() - - print("[+] Found {} phrases, writing to {}..." - .format(len(phrases), OUTFILE)) - write_final(phrases) + print("[*] Scraping all pages of KYM...", file=sys.stderr) + scrape_pages() if __name__ == "__main__": diff --git a/utilities/updating-sources.md b/utilities/updating-sources.md index 04a4848..adc97b0 100644 --- a/utilities/updating-sources.md +++ b/utilities/updating-sources.md @@ -7,30 +7,28 @@ Some of the source files get regular updates. Below is a guide to obtaining thos ## IMDB titles ``` -wget https://datasets.imdbws.com/title.basics.tsv.gz -gunzip ./title.basics.tsv.gz -cat title.basics.tsv | awk -F '\t' '{print $3}' > ./imdb-titles-$(date +%Y-%m-%d).txt -rm title.basics.tsv +curl https://datasets.imdbws.com/title.basics.tsv.gz \ + | gunzip \ + | awk -F '\t' '{print $3}' \ + > ./imdb-titles-$(date +%Y-%m-%d).txt ``` ## Wikipedia article titles & category names ``` -wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles-multistream-index.txt.bz2 -gunzip2 ./enwiki-latest-pages-articles-multistream-index.txt.bz2 -cat ./enwiki-latest-pages-articles-multistream-index.txt | cut -d: -f 3 > ./wikipedia-$(date +%Y-%m-%d).txt -rm enwiki-latest-pages-articles-multistream-index.txt - +curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles-multistream-index.txt.bz2 \ + | bzip2 \ + | cut -d: -f 3 \ + > ./wikipedia-$(date +%Y-%m-%d).txt ``` ## Wiktionary titles ``` -wget https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-all-titles.gz -gunzip enwiktionary-latest-all-titles.gz -cat enwiktionary-latest-all-titles | awk -F '\t' '{print $2}' > ./wiktionary-$(date +%Y-%m-%d).txt -rm enwiktionary-latest-all-titles - +curl https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-all-titles.gz \ + | gunzip \ + | awk -F '\t' '{print $2}' \ + > ./wiktionary-$(date +%Y-%m-%d).txt ``` ## Urban Dictionary @@ -45,18 +43,17 @@ python3 ./main.py --out urban-dictionary-$(date +%Y-%m-%d).txt ## Know Your Meme ``` -python3 /utilities/kym_scrape.py -mv memes.txt ./know-your-meme-$(date +%Y-%m-%d).txt +python3 utilities/kym_scrape.py > know-your-meme-$(date +%Y-%m-%d).txt ``` ## Global POI dataset ``` wget http://download.geonames.org/export/dump/allCountries.zip -unzip ./allCountries.zip -cat allCountries.txt | awk -F '\t' '{print $3}' > ./global-poi-$(date +%Y-%m-%d).txt +unzip -p ./allCountries.zip allCountries.txt \ + | awk -F '\t' '{print $3}' \ + > ./global-poi-$(date +%Y-%m-%d).txt rm allCountries.zip -rm allCountries.txt ``` ## Billboard charts @@ -84,4 +81,4 @@ If you generate a new version and want to compare what's new you can use a comma ``` sort 
 sort new.txt old.txt | uniq -u
-```
\ No newline at end of file
+```
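
A note on the stream split above: the rewritten scraper prints phrases to stdout and all status output (the progress counter and the ban/end-of-pages messages) to stderr, so a plain redirect captures only the wordlist. A minimal usage sketch, with illustrative filenames:

```
# Phrases (stdout) stream into the wordlist file as each page is
# scraped, while the progress counter (stderr) stays on the terminal.
# An interrupted or IP-banned run still leaves everything scraped so
# far in the file, replacing the old memes-incomplete.txt log.
python3 utilities/kym_scrape.py > know-your-meme-$(date +%Y-%m-%d).txt

# stdout also pipes cleanly into other tools without status text
# mixed in, e.g. for a quick phrase count:
python3 utilities/kym_scrape.py | wc -l
```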