diff --git a/CHANGELOG.md b/CHANGELOG.md index f04c36b..23d133e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,14 @@ # PyMeta Change Log All notable changes to this project will be documented in this file. +## [1.3.0] +### Added +* Google Custom Search API support with `--api-key` and `--search-engine-id` flags + +### Changed +* Updated installation documentation to recommend pipx over pip +* Added direct GitHub installation instructions + ## [1.2.0] ### Added * Users can now limit or specify searched file types diff --git a/README.md b/README.md index 793fb4e..f161f81 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,8 @@ PyMeta is a Python3 rewrite of the tool [PowerMeta](https://github.com/dafthack/ Once downloaded, metadata is extracted from these files using Phil Harvey's [exiftool](https://sno.phy.queensu.ca/~phil/exiftool/) and added to a ```.csv``` report. Alternatively, Pymeta can be pointed at a directory to extract metadata from files manually downloaded using the ```-dir``` command line argument. See the [Usage](#Usage), or [All Options](#All-Options) section for more information. +> **Note:** Due to Google's increasingly aggressive anti-bot measures, web scraping may yield limited results. For improved reliability and more consistent results, consider using the Google Custom Search API with the `--api-key` and `--search-engine-id` flags. + #### Why? Metadata is a common place for penetration testers and red teamers to find: domains, user accounts, naming conventions, software/version numbers, and more! 
@@ -24,25 +26,100 @@ Metadata is a common place for penetration testers and red teamers to find: doma     **Mac OS** - ```brew install exiftool``` ### Install: -Install the last stable release from PyPi: +**Recommended: Install with pipx (preferred method)** + +Using [pipx](https://pypa.github.io/pipx/) is the preferred installation method as it installs PyMeta in an isolated environment, preventing conflicts with system Python packages: + ```commandline -pip3 install pymetasec +pipx install pymetasec ``` -Or, install the most recent code from GitHub: +If you don't have pipx installed: +```commandline +# Ubuntu/Debian +sudo apt install pipx +pipx ensurepath + +# Mac OS +brew install pipx +pipx ensurepath + +# Or via pip (then restart your shell) +python3 -m pip install --user pipx +python3 -m pipx ensurepath ``` + +**Alternative: Install with pip** + +You can also install directly with pip, though this may affect system Python packages: +```commandline +pip3 install pymetasec +``` + +**Install from source:** + +Clone the repository and install locally: +```bash git clone https://github.com/m8sec/pymeta cd pymeta -python3 setup.py install +pipx install . +# Or with pip: pip3 install . +``` + +Or install directly from GitHub without cloning: +```bash +pipx install git+https://github.com/m8sec/pymeta +# Or with pip: pip3 install git+https://github.com/m8sec/pymeta ``` ## Usage + +### Standard Search (Web Scraping) * Search Google and Bing for files within example.com and extract metadata to a csv report:
```pymeta -d example.com``` * Extract metadata from files within the given directory and create csv report:
```pymeta -dir Downloads/``` +### Google API Search +Due to Google's aggressive anti-bot protections, web scraping may produce limited results. For better reliability, use the Google API option: + +```pymeta -d example.com --api-key "your_api_key_here" --search-engine-id "your_search_engine_id"``` + +#### Setting up Google API + +**Step 1: Create Google Cloud Project** +- Go to [Google Cloud Console](https://cloud.google.com/) +- Log in with a Google account +- Click "Select a project" → "New Project" +- Enter a project name (e.g., "PyMeta-API") +- Click "Create" + +**Step 2: Enable Custom Search API** +- In your project, go to "APIs & Services" → "Library" +- Search for "Custom Search API" +- Click on it and press "Enable" + +**Step 3: Create API Key** +- Go to "APIs & Services" → "Credentials" +- Click "Create Credentials" → "API Key" +- Copy your API key (you'll need this for the `--api-key` flag) + +**Step 4: Create Custom Search Engine** +- Go to [Google Programmable Search Engine](https://programmablesearchengine.google.com/) +- Click "Add a search engine" +- Enter any name (e.g., "PyMeta Search") +- For "Sites to search", select "Search the entire web" +- Click "Create" +- Copy your Search Engine ID (you'll need this for the `--search-engine-id` flag) + +**API Usage Notes:** +- Google provides 100 free API calls per day +- Additional requests cost $5 per 1000 queries +- API searches are more reliable than web scraping and less likely to be blocked +- When using API mode, only Google search is used (Bing searches are disabled) + +> **NOTE**: Thanks to Beau Bullock [(@dafthack)](https://twitter.com/dafthack) and the [PowerMeta](https://github.com/dafthack/PowerMeta) project for the above steps on getting a Google API key. 
## All Options ``` @@ -57,6 +134,10 @@ Search Options: --file-type FILE_TYPE File types to search (default=pdf,xls,xlsx,csv,doc,docx,ppt,pptx) -m MAX_RESULTS Max results per type search +Google API Options: + --api-key API_KEY Google API key for Custom Search API + --search-engine-id ID Google Custom Search Engine ID + Proxy Options: --proxy PROXY Proxy requests (IP:Port) --proxy-file PROXY Load proxies from file for rotation diff --git a/pymeta/__init__.py b/pymeta/__init__.py index 461aa3a..1c8e97d 100644 --- a/pymeta/__init__.py +++ b/pymeta/__init__.py @@ -11,12 +11,12 @@ from pymeta import utils from pymeta.logger import * from subprocess import getoutput -from pymeta.search import PyMeta, download_file +from pymeta.search import PyMeta, PyMetaAPI, download_file def status(args): - VERSION = 'v1.2.0' + VERSION = 'v1.3.0' print("\nPyMeta {} - {}\n".format(VERSION, highlight("by @m8sec", "bold", "gray"))) @@ -24,7 +24,13 @@ def status(args): return Log.info("Target Domain : {}".format(highlight(args.domain if args.domain else args.file_dir, "bold", "gray"), )) - Log.info("Search Engines(s) : {}".format(highlight(', '.join(args.engine), "bold", "gray"))) + + # Show API mode status + if args.api_key and args.search_engine_id: + Log.info("Search Mode : {}".format(highlight("Google Custom Search API", "bold", "green"))) + else: + Log.info("Search Engines(s) : {}".format(highlight(', '.join(args.engine), "bold", "gray"))) + Log.info("File Types(s) : {}".format(highlight(', '.join(args.file_type), "bold", "gray"), )) Log.info("Max Downloads : {}\n".format(highlight(args.max_results, "bold", "gray"))) @@ -41,6 +47,10 @@ def cli(): search.add_argument('--file-type', default='pdf,xls,xlsx,csv,doc,docx,ppt,pptx', type=lambda x: utils.delimiter2list(x), help='File types to search') search.add_argument('-m', dest='max_results', type=int, default=50, help='Max results per type search') + api = args.add_argument_group("Google API Options") + api.add_argument('--api-key', 
dest='api_key', type=str, default=None, help='Google API key for Custom Search API') + api.add_argument('--search-engine-id', dest='search_engine_id', type=str, default=None, help='Google Custom Search Engine ID') + p = args.add_argument_group("Proxy Options") pr = p.add_mutually_exclusive_group(required=False) pr.add_argument('--proxy', dest='proxy', action='append', default=[], help='Proxy requests (IP:Port)') @@ -59,14 +69,25 @@ def cli(): def start_scrape(args): tmp = [] - Log.info('Searching {} for {} file type(s) on "{}"'.format(', '.join(args.engine), len(args.file_type), args.domain)) - - for file_type in args.file_type: - for search_engine in args.engine: - pym = PyMeta(search_engine, args.domain, file_type, args.timeout, 3, args.proxy, args.jitter, args.max_results) - if search_engine in pym.url.keys(): - tmp += pym.search() - tmp = list(set(tmp)) + + # Check if API credentials are provided + if args.api_key and args.search_engine_id: + Log.info('Using Google Custom Search API for {} file type(s) on "{}"'.format(len(args.file_type), args.domain)) + + for file_type in args.file_type: + pym_api = PyMetaAPI(args.api_key, args.search_engine_id, args.domain, file_type, args.max_results) + tmp += pym_api.search() + tmp = list(set(tmp)) + else: + # Fall back to web scraping + Log.info('Searching {} for {} file type(s) on "{}"'.format(', '.join(args.engine), len(args.file_type), args.domain)) + + for file_type in args.file_type: + for search_engine in args.engine: + pym = PyMeta(search_engine, args.domain, file_type, args.timeout, 3, args.proxy, args.jitter, args.max_results) + if search_engine in pym.url.keys(): + tmp += pym.search() + tmp = list(set(tmp)) dwnld_dir = download_results(args, tmp) extract_exif(dwnld_dir, args.report_file, tmp) @@ -131,6 +152,12 @@ def extract_exif(file_dir, output_file, urls=[]): def main(): try: args = cli() + + # Validate API arguments + if (args.api_key and not args.search_engine_id) or (not args.api_key and 
args.search_engine_id): + Log.fail("Both --api-key and --search-engine-id must be provided together for API mode") + exit(1) + status(args) exif.exif_check() diff --git a/pymeta/search.py b/pymeta/search.py index 06d0db0..751e026 100644 --- a/pymeta/search.py +++ b/pymeta/search.py @@ -106,6 +106,95 @@ def results_handler(self, link): logging.debug('Added URL: {}'.format(url)) +class PyMetaAPI: + """Google Custom Search API implementation for PyMeta""" + + def __init__(self, api_key, search_engine_id, target, file_type, max_results=50): + self.api_key = api_key + self.search_engine_id = search_engine_id + self.target = target + self.file_type = file_type.lower() + self.max_results = max_results + self.results = [] + self.api_url = "https://www.googleapis.com/customsearch/v1" + + def search(self): + """Search for files using Google Custom Search API""" + start_index = 1 + + while len(self.results) < self.max_results: + try: + # Google API has a limit of 100 results + if start_index > 100: + Log.info('Reached Google API limit of 100 results') + break + + # Build the API request + params = { + 'key': self.api_key, + 'cx': self.search_engine_id, + 'q': f'site:{self.target} filetype:{self.file_type}', + 'start': start_index + } + + logging.debug(f'API Request: {self.api_url} with start={start_index}') + response = requests.get(self.api_url, params=params, timeout=10) + + # Check for API errors + if response.status_code == 429: + Log.warn('API quota limit exceeded! 
Daily limit reached.') + break + elif response.status_code != 200: + Log.warn(f'API request failed with status code: {response.status_code}') + break + + data = response.json() + + # Check for errors in response + if 'error' in data: + error_msg = data['error'].get('message', 'Unknown error') + Log.warn(f'API Error: {error_msg}') + break + + # Extract file URLs from results + if 'items' in data: + for item in data['items']: + url = item.get('link', '') + if url and self.is_valid_file_url(url): + self.results.append(url) + logging.debug(f'Added URL: {url}') + + Log.info(f"{len(self.results):<3} | {self.file_type:<4} - API search (status: {response.status_code})") + else: + # No more results + logging.debug('No items in API response') + break + + # Check if there are more pages + if 'queries' not in data or 'nextPage' not in data['queries']: + logging.debug('No more pages available') + break + + # Move to next page (Google returns 10 results per page) + start_index += 10 + + # Brief delay between API calls + sleep(0.5) + + except requests.exceptions.RequestException as e: + Log.warn(f'API request error: {e}') + break + except Exception as e: + Log.warn(f'Unexpected error during API search: {e}') + break + + return self.results + + def is_valid_file_url(self, url): + """Check if URL points to the target file type""" + return url.lower().endswith(f'.{self.file_type}') + + def get_statuscode(resp): try: return resp.status_code diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..017a99a --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,41 @@ +[project] +name = "pymetasec" +version = "1.3.0" +description = "PyMeta is a Python3 rewrite of PowerMeta for metadata extraction from files found via Google and Bing searches." 
+authors = [ + { name = "m8sec" } +] +readme = "README.md" +license = {text = "GPL-3.0-or-later"} +requires-python = ">=3.6" +classifiers = [ + "Environment :: Console", + "Programming Language :: Python :: 3", + "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", + "Topic :: Security" +] +dependencies = [ + "requests>=2.28.1", + "bs4>=0.0.1", + "tldextract>=3.0.0", + "lxml>=4.9.0" +] + +[project.urls] +Homepage = "https://github.com/m8sec/pymeta" + +[tool.setuptools.packages.find] +where = ["."] +include = ["pymeta*"] +exclude = ["tests*"] + +[tool.setuptools.package-data] +pymeta = ["resources/*"] + +[project.scripts] +pymeta = "pymeta:main" +pymetasec = "pymeta:main" + +[build-system] +requires = ["setuptools", "wheel"] +build-backend = "setuptools.build_meta" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index fbf2cc4..05ec0d1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,13 +1,4 @@ -beautifulsoup4>=4.11.1 -bs4>=0.0.1 -certifi>=2022.9.14 -charset-normalizer>=2.1.1 -filelock>=3.8.0 -idna>=3.4 -lxml>=4.9.2 requests>=2.28.1 -requests-file>=1.5.1 -six>=1.16.0 -soupsieve>=2.3.2.post1 -tldextract>=3.3.1 -urllib3>=1.26.12 +bs4>=0.0.1 +tldextract>=3.0.0 +lxml>=4.9.0 diff --git a/setup.py b/setup.py index 2311628..9221fba 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='pymetasec', - version='1.2.0', + version='1.3.0', author= 'm8sec', long_description=long_description, long_description_content_type="text/markdown", @@ -16,7 +16,9 @@ package_data={'pymeta': ['resources/*']}, install_requires=[ 'requests>=2.28.1', - 'bs4>=0.0.1' + 'bs4>=0.0.1', + 'tldextract>=3.0.0', + 'lxml>=4.9.0' ], classifiers = [ "Environment :: Console",