From c699d0ebf5acc7f2e751a0d4d0be791cf30e1f75 Mon Sep 17 00:00:00 2001 From: Wootski Date: Fri, 27 Mar 2015 11:55:44 -0400 Subject: [PATCH 1/9] Update PDFInfo imports - Added imports for adding child objects - re-order existing imports to look nicer --- pdfinfo_service/__init__.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pdfinfo_service/__init__.py b/pdfinfo_service/__init__.py index 5e1b219f..dd8431eb 100644 --- a/pdfinfo_service/__init__.py +++ b/pdfinfo_service/__init__.py @@ -1,13 +1,15 @@ import hashlib import logging - -from crits.services.core import Service, ServiceConfigError - -import pdfparser -import pdfid import math import re import json +import pdfparser +import pdfid + +from crits.services.core import Service, ServiceConfigError +from crits.samples.handlers import handle_file + +from . import forms logger = logging.getLogger(__name__) From 098a5692bafb8eaa682d558686eaea906e267f44 Mon Sep 17 00:00:00 2001 From: Wootski Date: Fri, 27 Mar 2015 11:59:59 -0400 Subject: [PATCH 2/9] Render runtime config form. - Prompt user to submit suspicious PDF child objects. --- pdfinfo_service/__init__.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pdfinfo_service/__init__.py b/pdfinfo_service/__init__.py index dd8431eb..556b4b9a 100644 --- a/pdfinfo_service/__init__.py +++ b/pdfinfo_service/__init__.py @@ -26,6 +26,7 @@ class PDFInfoService(Service): version = '1.2.0' description = "Extract information from PDF files." supported_types = ['Sample'] + added_files = [] @staticmethod def valid_for(obj): @@ -33,6 +34,26 @@ def valid_for(obj): if not obj.is_pdf(): raise ServiceConfigError("Not a valid PDF.") + @staticmethod + def bind_runtime_form(analyst, config): + if 'pdf_objects' not in config: + config['pdf_objects'] = False + return forms.PDFInfoRunForm(config) + + @classmethod + def generate_runtime_form(self, analyst, config, crits_type, identifier): + return render_to_string('services_run_form.html', + {'name': self.name, + 'form': forms.PDFInfoRunForm(), + 'crits_type': crits_type, + 'identifier': identifier}) + + @staticmethod + def get_config(existing_config): + # There are no config options for this service, blow away any existing + # configs. + return {} + def H(self, data): """ Calculate entropy for provided data From 269cd6f471935cc97545f4bc6c0b7ba49fa9b791 Mon Sep 17 00:00:00 2001 From: Wootski Date: Fri, 27 Mar 2015 14:26:37 -0400 Subject: [PATCH 3/9] Testing child files and improvements to object identification. - Testing code included for submitting child objects of interest - Improvements to identifying object content type. --- pdfinfo_service/README | 4 ++ pdfinfo_service/__init__.py | 76 ++++++++++++++++++++++++++++++++++++- pdfinfo_service/forms.py | 12 ++++++ 3 files changed, 91 insertions(+), 1 deletion(-) create mode 100644 pdfinfo_service/forms.py diff --git a/pdfinfo_service/README b/pdfinfo_service/README index 8cb557ea..414b5f34 100644 --- a/pdfinfo_service/README +++ b/pdfinfo_service/README @@ -15,3 +15,7 @@ How to upgrade PDF tools: - self.infile = open(file, 'rb') + import StringIO + self.infile = StringIO.StringIO(file) + +TODO: +- Handle encrypted PDF objects + - Example sample: 1e46c60e65ae9f9c9c8850372d8da491 diff --git a/pdfinfo_service/__init__.py b/pdfinfo_service/__init__.py index 556b4b9a..3b4af00e 100644 --- a/pdfinfo_service/__init__.py +++ b/pdfinfo_service/__init__.py @@ -6,6 +6,8 @@ import pdfparser import pdfid +from django.template.loader import render_to_string + from crits.services.core import Service, ServiceConfigError from crits.samples.handlers import handle_file @@ -123,7 +125,7 @@ def object_search(self, data, search_size=100): (r'js', '/JS\n'), (r'js', '/JS\r\n'), (r'file', '/F\n'), - (r'file', '/F\r\n')] + (r'file', '/F\r\n'),] #Walk the PDF objects while done == False: @@ -156,10 +158,49 @@ def object_search(self, data, search_size=100): objects[item[0]].append(str(pdf_object.id)) else: objects[item[0]] = [str(pdf_object.id)] + #Check object type + if pdf_object.GetType() == '/EmbeddedFile': + if objects.get('file'): + objects['file'].append(str(pdf_object.id)) + else: + objects['file'] = [str(pdf_object.id)] + else: done = True return objects + def add_objects(self, obj_id, reason, data): + """ + Manage the insertion of child objects + - Use signatures to filter/inspect embedded files + - Fields: title, header, search window size + """ + file_sigs = [('Flash', 'CWS', 50), + ('Flash', 'FWS', 50)] + file_sigs_found = False + + #Filter/extract embedded files that are being submitted + if reason == 'EmbeddedFile': + for sig in file_sigs: + search_header = sig[1] + search_window = sig[2] + offset = stream[:search_window].find(search_header) + if offset >= 0: + file_sigs_found = True + reason = '{} ({})'.format(reason, sig[0]) + data = data[offset:] + break + if file_sigs_found == False: + return + + #Add object to addded_files list + md5_digest = hashlib.md5(data).hexdigest() + self.added_files.append([md5_digest, + obj_id, + len(data), + reason, + data]) + def run_pdfparser(self, data): """ Uses pdf-parser to get information for each object. @@ -223,9 +264,27 @@ def run_pdfparser(self, data): if found_objects.get('js'): if str(pdf_object.id) in found_objects.get('js'): object_content.append('JavaScript') + #Submit JavaScript objects to CRITS + if object_stream: + self.add_objects('{} (stream)'.format(pdf_object.id), + 'JavaScript', + streamContent) + else: + self.add_objects('{}'.format(pdf_object.id), + 'JavaScript', + rawContent) if found_objects.get('file'): if str(pdf_object.id) in found_objects.get('file'): object_content.append('EmbeddedFile') + #Submit (some) embedded files to CRITS + if object_stream: + self.add_objects('{} (stream)'.format(pdf_object.id), + 'EmbeddedFile', + streamContent) + else: + self.add_objects('{}'.format(pdf_object.id), + 'EmbeddedFile', + rawContent) result = { "obj_id": pdf_object.id, @@ -258,6 +317,21 @@ def run(self, obj, config): self.run_pdfid(data) self._notify() self.run_pdfparser(data) + self._notify() + + #Add child objects + if config['pdf_objects']: + for f in self.added_files: + self._info('{} {} {} {}'.format(f[0], f[1], f[2], f[3])) + """ + handle_file(f[0], f[4], obj.source, + related_id=str(obj.id), + campaign=obj.campaign, + method=self.name, + relationship='Extracted_From', + user=self.current_task.username) + self._add_result("pdf_objects_added", f[0], {'obj_id':f[1],'size': f[1],'reason': f[3]}) + """ def _parse_error(self, item, e): self._error("Error parsing %s (%s): %s" % (item, e.__class__.__name__, e)) diff --git a/pdfinfo_service/forms.py b/pdfinfo_service/forms.py new file mode 100644 index 00000000..b57ead96 --- /dev/null +++ b/pdfinfo_service/forms.py @@ -0,0 +1,12 @@ +from django import forms + +class PDFInfoRunForm(forms.Form): + error_css_class = 'error' + required_css_class = 'required' + pdf_objects = forms.BooleanField(required=False, + label="Objects", + help_text="New samples from suspicious PDF objects.", + initial=True) + + def __init__(self, *args, **kwargs): + super(PDFInfoRunForm, self).__init__(*args, **kwargs) From c657597aa16d57c4e8146a54be9a21d5220c1b78 Mon Sep 17 00:00:00 2001 From: Wootski Date: Fri, 27 Mar 2015 16:33:44 -0400 Subject: [PATCH 4/9] Bug fix: incorrectly referencing data --- pdfinfo_service/README | 7 +++++++ pdfinfo_service/__init__.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/pdfinfo_service/README b/pdfinfo_service/README index 414b5f34..d0335602 100644 --- a/pdfinfo_service/README +++ b/pdfinfo_service/README @@ -17,5 +17,12 @@ How to upgrade PDF tools: + self.infile = StringIO.StringIO(file) TODO: +- Handle JavaScript objects that point to /Names and don't contain JavaScript + - Example sample: 143a09611c45ac34ff0f85cc5efcc2e.pdf + - Raw content: << /Names [ (a) 36 0 R (b) 37 0 R (c) 16 0 R (c) 55 0 R ] >> + - Example sample: 2a7b8180da2906c9889f13fa912df6a0 + - Raw content: << /Names [ t 17 0 R ] >> - Handle encrypted PDF objects - Example sample: 1e46c60e65ae9f9c9c8850372d8da491 +- Look for multiple PDF files concatenated together (PDF/EOF headers) + - Example: sample: 1188ea8f0d086a8860a3aafb54a3fa76 diff --git a/pdfinfo_service/__init__.py b/pdfinfo_service/__init__.py index 3b4af00e..7c6f7751 100644 --- a/pdfinfo_service/__init__.py +++ b/pdfinfo_service/__init__.py @@ -184,7 +184,7 @@ def add_objects(self, obj_id, reason, data): for sig in file_sigs: search_header = sig[1] search_window = sig[2] - offset = stream[:search_window].find(search_header) + offset = data[:search_window].find(search_header) if offset >= 0: file_sigs_found = True reason = '{} ({})'.format(reason, sig[0]) From 6e6965af7cdec09bfbcc56e3c9827e83d2971df1 Mon Sep 17 00:00:00 2001 From: Wootski Date: Fri, 27 Mar 2015 16:54:23 -0400 Subject: [PATCH 5/9] Sort PDFid output - Items now sorted by count. --- pdfinfo_service/__init__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pdfinfo_service/__init__.py b/pdfinfo_service/__init__.py index 7c6f7751..ebaf4156 100644 --- a/pdfinfo_service/__init__.py +++ b/pdfinfo_service/__init__.py @@ -96,12 +96,14 @@ def run_pdfid(self, data): if xml_json_success: try: - for item in pdfid_dict['pdfid']['keywords']['keyword']: + pdf_summary = pdfid_dict['pdfid']['keywords']['keyword'] + for item in sorted(pdf_summary, key=lambda x: int(x['count']), reverse=True): self._add_result('pdfid', item['name'], {'count':item['count']}) except KeyError: pass else: - for count, item in re.findall(r']+Name=\"([^\"]+)\"',xml_data.toxml()): + pdf_summary = re.findall(r']+Name=\"([^\"]+)\"',xml_data.toxml()) + for count, item in sorted(pdf_summary, key=lambda x: int(x[0]), reverse=True): self._add_result('pdfid', item, {'count':count}) def object_search(self, data, search_size=100): From c3ecb6a5005c62c06f974935610b6df1e11a1aef Mon Sep 17 00:00:00 2001 From: Wootski Date: Mon, 30 Mar 2015 14:54:18 -0400 Subject: [PATCH 6/9] Add detection using PDFid - Notify user of JavaScript, encryption and open actions - Look for uneven counts of obj and endobj --- pdfinfo_service/README | 1 + pdfinfo_service/__init__.py | 53 ++++++++++++++++++++++++++++++++++--- 2 files changed, 51 insertions(+), 3 deletions(-) diff --git a/pdfinfo_service/README b/pdfinfo_service/README index d0335602..7a01a5af 100644 --- a/pdfinfo_service/README +++ b/pdfinfo_service/README @@ -22,6 +22,7 @@ TODO: - Raw content: << /Names [ (a) 36 0 R (b) 37 0 R (c) 16 0 R (c) 55 0 R ] >> - Example sample: 2a7b8180da2906c9889f13fa912df6a0 - Raw content: << /Names [ t 17 0 R ] >> + - Typically these objects contain one line, the reference. - Handle encrypted PDF objects - Example sample: 1e46c60e65ae9f9c9c8850372d8da491 - Look for multiple PDF files concatenated together (PDF/EOF headers) diff --git a/pdfinfo_service/__init__.py b/pdfinfo_service/__init__.py index ebaf4156..81db4c24 100644 --- a/pdfinfo_service/__init__.py +++ b/pdfinfo_service/__init__.py @@ -29,6 +29,7 @@ class PDFInfoService(Service): description = "Extract information from PDF files." supported_types = ['Sample'] added_files = [] + detection = {} @staticmethod def valid_for(obj): @@ -80,12 +81,29 @@ def _get_pdf_version(self, data): else: return "0.0" + def add_detection(self, tag, reason, obj_id=''): + """ + Add a new items to the list of detection results + """ + if tag in self.detection.keys(): + if not obj_id in self.detection[tag][1]: + self.detection[tag][1].append(obj_id) + else: + self.detection[tag] = [reason, [obj_id]] + def run_pdfid(self, data): """ Uses PDFid to generate stats for the PDF - Display keyword matches """ xml_json_success = True + javascript = False + encrypted = False + open_action = False + start_obj = 0 + end_obj = 0 + results_list = [] + xml_data = pdfid.PDFiD(data) try: @@ -98,13 +116,39 @@ def run_pdfid(self, data): try: pdf_summary = pdfid_dict['pdfid']['keywords']['keyword'] for item in sorted(pdf_summary, key=lambda x: int(x['count']), reverse=True): - self._add_result('pdfid', item['name'], {'count':item['count']}) + results_list.append([item['count'], item['name']]) + self._add_result('pdfid', item['name'], {'count': item['count']}) except KeyError: pass else: pdf_summary = re.findall(r']+Name=\"([^\"]+)\"',xml_data.toxml()) + results_list = pdf_summary for count, item in sorted(pdf_summary, key=lambda x: int(x[0]), reverse=True): - self._add_result('pdfid', item, {'count':count}) + self._add_result('pdfid', item, {'count': count}) + + #Detection rules using PDFid + for count, item in results_list: + if int(count) > 0: + if item == 'obj': + start_obj = count + elif item == 'endobj': + end_obj = count + elif item == '/JS': + javascript = True + elif item == '/JavaScript': + javascript = True + elif item == '/Encrypt': + encrypted = True + elif item == "/OpenAction": + open_action = True + if javascript: + self.add_detection('/JavaScript, /JS', 'PDF contains JavaScript.') + if encrypted: + self.add_detection('/Encrypted', 'PDF contains encrypted content.') + if open_action: + self.add_detection('/OpenAction', 'PDF performs defined actions when opened.') + if not start_obj == end_obj: + self.add_detection('obj, endobj', 'PDF contains uneven number of "obj" and "endobj" statements.') def object_search(self, data, search_size=100): """ @@ -332,8 +376,11 @@ def run(self, obj, config): method=self.name, relationship='Extracted_From', user=self.current_task.username) - self._add_result("pdf_objects_added", f[0], {'obj_id':f[1],'size': f[1],'reason': f[3]}) + self._add_result("pdf_objects_added", f[0], {'obj_id': f[1], 'size': f[1], 'reason': f[3]}) """ + #Add detection items + for key, value in self.detection.items(): + self._add_result("pdf_detection", key, {'description': value[0], 'obj_id(s)': ', '.join(value[1])}) def _parse_error(self, item, e): self._error("Error parsing %s (%s): %s" % (item, e.__class__.__name__, e)) From aae712a9162f4d0c76a5455edfa71c75bd8ed63a Mon Sep 17 00:00:00 2001 From: Wootski Date: Mon, 30 Mar 2015 15:03:04 -0400 Subject: [PATCH 7/9] Add JavaScript helper functions - JS minimization - JS formatting using the jsbeautifier library --- pdfinfo_service/DEPENDENCIES | 3 +++ pdfinfo_service/__init__.py | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/pdfinfo_service/DEPENDENCIES b/pdfinfo_service/DEPENDENCIES index e02af7aa..4aaaba43 100644 --- a/pdfinfo_service/DEPENDENCIES +++ b/pdfinfo_service/DEPENDENCIES @@ -1,2 +1,5 @@ PDFInfo leverages the work of Didier Stevens and his pdf-parser. That script requires Numpy to run. + +PDFInfo also requires jsbeautifier: +pip install jsbeautifier diff --git a/pdfinfo_service/__init__.py b/pdfinfo_service/__init__.py index 81db4c24..78124fa2 100644 --- a/pdfinfo_service/__init__.py +++ b/pdfinfo_service/__init__.py @@ -5,6 +5,7 @@ import json import pdfparser import pdfid +import jsbeautifier from django.template.loader import render_to_string @@ -91,6 +92,39 @@ def add_detection(self, tag, reason, obj_id=''): else: self.detection[tag] = [reason, [obj_id]] + def js_beautify(self, stream): + """ + Beautify Javascript output + """ + if stream: + return str(jsbeautifier.beautify(stream)) + return + + def js_minimize(self,data): + """ + Very simple JavaScript minimization + - Attempt to simplify embedded JavaScript + - Remove comments + - Remove string escapes (\x20) + - Replace formatting + - Minimize string logic + - Replace dict style references (abc["xyz"] for abc.xyz) + """ + result = None + try: + result = re.sub(r'//*(.+?)/*/','',data) + result = result.decode('string_escape','ignore') + result = urllib2.unquote(result) + result = result.replace('\\n', '\n') + result = result.replace('\\r', '\r') + result = result.replace('\\t', '\t') + result = result.replace('"+"', '') + result = result.replace('\'+\'','') + result = re.sub(r'(\w+)\[\"([^\"]+)\"\]', r'\1.\2', result) + except Exception: + pass + return result + def run_pdfid(self, data): """ Uses PDFid to generate stats for the PDF From d53fc25f5dae6c595088c7917a5991878860f2b9 Mon Sep 17 00:00:00 2001 From: Wootski Date: Thu, 2 Apr 2015 09:51:33 -0400 Subject: [PATCH 8/9] PDF Detection techniques and formatting changes - PEP8 changes - Locate embedded PDF documents - Detect PDF strings of interest - Submit child files based on JS/file header detection --- pdfinfo_service/README | 8 - pdfinfo_service/__init__.py | 298 +++++++++++++++++++++++++----------- 2 files changed, 208 insertions(+), 98 deletions(-) diff --git a/pdfinfo_service/README b/pdfinfo_service/README index 7a01a5af..414b5f34 100644 --- a/pdfinfo_service/README +++ b/pdfinfo_service/README @@ -17,13 +17,5 @@ How to upgrade PDF tools: + self.infile = StringIO.StringIO(file) TODO: -- Handle JavaScript objects that point to /Names and don't contain JavaScript - - Example sample: 143a09611c45ac34ff0f85cc5efcc2e.pdf - - Raw content: << /Names [ (a) 36 0 R (b) 37 0 R (c) 16 0 R (c) 55 0 R ] >> - - Example sample: 2a7b8180da2906c9889f13fa912df6a0 - - Raw content: << /Names [ t 17 0 R ] >> - - Typically these objects contain one line, the reference. - Handle encrypted PDF objects - Example sample: 1e46c60e65ae9f9c9c8850372d8da491 -- Look for multiple PDF files concatenated together (PDF/EOF headers) - - Example: sample: 1188ea8f0d086a8860a3aafb54a3fa76 diff --git a/pdfinfo_service/__init__.py b/pdfinfo_service/__init__.py index 78124fa2..bcb9c0d2 100644 --- a/pdfinfo_service/__init__.py +++ b/pdfinfo_service/__init__.py @@ -25,9 +25,9 @@ class PDFInfoService(Service): to scan PDF files, extract metadata and create hashes of each object. """ - name = "pdfinfo" + name = 'pdfinfo' version = '1.2.0' - description = "Extract information from PDF files." + description = 'Extract information from PDF files.' supported_types = ['Sample'] added_files = [] detection = {} @@ -36,7 +36,7 @@ class PDFInfoService(Service): def valid_for(obj): # Only run on PDF files if not obj.is_pdf(): - raise ServiceConfigError("Not a valid PDF.") + raise ServiceConfigError('Not a valid PDF.') @staticmethod def bind_runtime_form(analyst, config): @@ -82,15 +82,15 @@ def _get_pdf_version(self, data): else: return "0.0" - def add_detection(self, tag, reason, obj_id=''): + def add_detection(self, desc, obj_id=''): """ Add a new items to the list of detection results """ - if tag in self.detection.keys(): - if not obj_id in self.detection[tag][1]: - self.detection[tag][1].append(obj_id) + if desc in self.detection.keys(): + if obj_id not in self.detection[desc]: + self.detection[desc].append(obj_id) else: - self.detection[tag] = [reason, [obj_id]] + self.detection[desc] = [obj_id] def js_beautify(self, stream): """ @@ -100,7 +100,7 @@ def js_beautify(self, stream): return str(jsbeautifier.beautify(stream)) return - def js_minimize(self,data): + def js_minimize(self, data): """ Very simple JavaScript minimization - Attempt to simplify embedded JavaScript @@ -112,14 +112,14 @@ def js_minimize(self,data): """ result = None try: - result = re.sub(r'//*(.+?)/*/','',data) - result = result.decode('string_escape','ignore') + result = re.sub(r'//*(.+?)/*/', '', data) + result = result.decode('string_escape', 'ignore') result = urllib2.unquote(result) result = result.replace('\\n', '\n') result = result.replace('\\r', '\r') result = result.replace('\\t', '\t') result = result.replace('"+"', '') - result = result.replace('\'+\'','') + result = result.replace('\'+\'', '') result = re.sub(r'(\w+)\[\"([^\"]+)\"\]', r'\1.\2', result) except Exception: pass @@ -137,15 +137,14 @@ def run_pdfid(self, data): start_obj = 0 end_obj = 0 results_list = [] - xml_data = pdfid.PDFiD(data) try: - json_data = pdfid.PDFiD2JSON(xml_data,'') + json_data = pdfid.PDFiD2JSON(xml_data, '') pdfid_dict = json.loads(json_data)[0] except UnicodeDecodeError: xml_json_success = False - + if xml_json_success: try: pdf_summary = pdfid_dict['pdfid']['keywords']['keyword'] @@ -155,12 +154,12 @@ def run_pdfid(self, data): except KeyError: pass else: - pdf_summary = re.findall(r']+Name=\"([^\"]+)\"',xml_data.toxml()) + pdf_summary = re.findall(r']+Name=\"([^\"]+)\"', xml_data.toxml()) results_list = pdf_summary for count, item in sorted(pdf_summary, key=lambda x: int(x[0]), reverse=True): self._add_result('pdfid', item, {'count': count}) - #Detection rules using PDFid + # Detection rules using PDFid for count, item in results_list: if int(count) > 0: if item == 'obj': @@ -176,13 +175,13 @@ def run_pdfid(self, data): elif item == "/OpenAction": open_action = True if javascript: - self.add_detection('/JavaScript, /JS', 'PDF contains JavaScript.') + self.add_detection('PDF contains JavaScript.') if encrypted: - self.add_detection('/Encrypted', 'PDF contains encrypted content.') + self.add_detection('PDF contains encrypted content.') if open_action: - self.add_detection('/OpenAction', 'PDF performs defined actions when opened.') + self.add_detection('PDF performs defined actions when opened.') if not start_obj == end_obj: - self.add_detection('obj, endobj', 'PDF contains uneven number of "obj" and "endobj" statements.') + self.add_detection('PDF contains uneven number of "obj" and "endobj" definitions.') def object_search(self, data, search_size=100): """ @@ -190,65 +189,160 @@ def object_search(self, data, search_size=100): @return dictionary containing object types and object id's - Use regex and strings to locate PDF tags of interest - Note: It is important that objects_str definitions do + Note: It is important that objects_str definitions do not detect objects found with objects_regex defs. """ oPDFParser = pdfparser.cPDFParser(data) - done = False + done = False objects = {} objects_regex = [(r'js', r'\/JavaScript\s(\d+)\s\d+\sR'), - (r'js', r'\/JS\s(\d+)\s\d+\sR'), - (r'file', r'\/F\s(\d+)\s\d+\sR')] + (r'js', r'\/JS\s(\d+)\s\d+\sR'), + (r'file', r'\/F\s(\d+)\s\d+\sR')] objects_str = [(r'js', '/JavaScript\n'), - (r'js', '/JavaScript\r\n'), - (r'js', '/JS\n'), - (r'js', '/JS\r\n'), - (r'file', '/F\n'), - (r'file', '/F\r\n'),] - - #Walk the PDF objects - while done == False: + (r'js', '/JavaScript\r\n'), + (r'js', '/JS\n'), + (r'js', '/JS\r\n'), + (r'file', '/F\n'), + (r'file', '/F\r\n')] + + # Walk the PDF objects + while done is False: try: pdf_object = oPDFParser.GetObject() except Exception as e: pdf_object = None - if pdf_object != None: + if pdf_object is not None: if pdf_object.type in [pdfparser.PDF_ELEMENT_INDIRECT_OBJECT]: - #See if this PDF object has references to items of interest + # See if this PDF object has references to items of interest rawContent = pdfparser.FormatOutput(pdf_object.content, True) pdf_references = pdf_object.GetReferences() if pdf_references: - #Match getReferences() with objects_regex results + # Match getReferences() with regex search results for item in objects_regex: - matches = re.findall(item[1],rawContent[:search_size]) + matches = re.findall(item[1], rawContent[:search_size]) for match in matches: for ref in pdf_references: - #Record found items + # Record found items if match == ref[0]: if objects.get(item[0]): objects[item[0]].append(match) else: objects[item[0]] = [match] - #Find items within the current object. + # Find items within the current object. for item in objects_str: if pdf_object.Contains(item[1]): if objects.get(item[0]): objects[item[0]].append(str(pdf_object.id)) else: objects[item[0]] = [str(pdf_object.id)] - #Check object type + # Check object type if pdf_object.GetType() == '/EmbeddedFile': if objects.get('file'): objects['file'].append(str(pdf_object.id)) else: objects['file'] = [str(pdf_object.id)] - else: done = True return objects + def js_detection(self, obj_id, data): + """ + JavaScript detection techniques + - Look for string matches of interest + """ + detection_strings = { + '=unescape': 'JavaScript contains a reference to the "unescape()" function.', + '= unescape': 'JavaScript contains a reference to the "unescape()" function.', + 'unescape': 'JavaScript contains a reference to the "unescape()" function.', + 'eval(': 'JavaScript contains a reference to the "eval()" function.', + '=eval': 'JavaScript contains a reference to the "eval()" function.', + '= eval': 'JavaScript contains a reference to the "eval()" function.', + '.replace': 'JavaScript contains a reference to the "replace" method.', + '.substring': 'JavaScript contains a reference to the "substring" method.', + '.toString': 'JavaScript contains a reference to the "toString" method.', + '.fromCharCode': 'JavaScript contains a reference to the "fromCharCode" method.', + '.charCodeAt': 'JavaScript contains a reference to the "charCodeAt" method.', + 'util.byteToChar': 'JavaScript contains a reference to the "util.byteToChar" function.', + '.slice': 'JavaScript contains a reference to the "slice" method.', + '.concat': 'JavaScript contains a reference to the "concat" method.', + '.length': 'JavaScript contains a reference to the "length" method.', + 'util.printd': 'JavaScript contains a reference to the "util.printd" function.', + 'Math.ceil': 'JavaScript contains a reference to the "math.ceil" function.', + 'app.viewerVersion': 'JavaScript attempts to detect the viewer version.', + 'app.viewerType': 'JavaScript attempts to detect the viewer type.', + 'app.setTimeOut': 'JavaScript calls "app.setTimeOut()", this can be used to evaluate code.', + 'new Array': 'JavaScript creates an array.', + 'try {} catch(e)': 'JavaScript contains empty exception handling block.', + 'for(': 'JavaScript contains a for() loop.', + 'for (': 'JavaScript contains a for() loop.', + 'while(': 'JavaScript contains a while() loop.', + 'while ': 'JavaScript contains a while() loop.', + 'function ': 'JavaScript contains one or more function definitions.', + 'shellcode': 'JavaScript references a suspicious variable name.', + '(sc)': 'JavaScript references a suspicious variable name.', + 'sc.': 'JavaScript references a suspicious variable name.', + ' sc ': 'JavaScript references a suspicious variable name.', + 'sc+': 'JavaScript references a suspicious variable name.', + 'sc +': 'JavaScript references a suspicious variable name.', + 'sc=': 'JavaScript references a suspicious variable name.', + 'sc =': 'JavaScript references a suspicious variable name.', + 'var nop': 'JavaScript references a suspicious variable name.', + 'nop=': 'JavaScript references a suspicious variable name.', + 'nop =': 'JavaScript references a suspicious variable name.', + 'nop+': 'JavaScript references a suspicious variable name.', + 'nop +': 'JavaScript references a suspicious variable name.', + 'sled =': 'JavaScript references a suspicious variable name.', + 'sled=': 'JavaScript references a suspicious variable name.', + 'sled+': 'JavaScript references a suspicious variable name.', + 'sled +': 'JavaScript references a suspicious variable name.', + 'nopsled': 'JavaScript references a suspicious variable name.', + '\\x90\\x90\\x90\\x90': 'JavaScript references a suspicious byte sequence.', + 'heap+': 'JavaScript references a suspicious variable name.', + 'heap +': 'JavaScript references a suspicious variable name.', + 'heap=': 'JavaScript references a suspicious variable name.', + 'heap =': 'JavaScript references a suspicious variable name.', + '_heap': 'JavaScript references a suspicious variable name.', + 'heapspray': 'JavaScript references a suspicious variable name.', + 'var rop': 'JavaScript references a suspicious variable name.', + 'rop_': 'JavaScript references a suspicious variable name.', + 'rop=': 'JavaScript references a suspicious variable name.', + 'rop =': 'JavaScript references a suspicious variable name.', + 'rop+': 'JavaScript references a suspicious variable name.', + 'rop +': 'JavaScript references a suspicious variable name.', + 'payload': 'JavaScript references a suspicious variable name.', + 'mem+': 'JavaScript references a suspicious variable name.', + 'mem=': 'JavaScript references a suspicious variable name.', + 'memory': 'JavaScript references a suspicious variable name.', + 'exploit': 'JavaScript references a suspicious variable name.', + 'util.printf': 'JavaScript makes a suspicious call to the "util.printf" function (e.g. CVE-2008-2992).', + 'collab["GetIcon"]': 'JavaScript makes a suspicious call to the "collab.GetIcon" function (e.g. CVE-2009-0927).', + 'collab.GetIcon': 'JavaScript makes a suspicious call to the "collab.GetIcon" function (e.g. CVE-2009-0927).', + 'doc["printSeps"]': 'JavaScript makes a suspicious call to the "doc.printSeps" function (e.g. CVE-2010-4091).', + 'doc.printSeps': 'JavaScript makes a suspicious call to the "doc.printSeps" function (e.g. CVE-2010-4091).', + 'media["newPlayer"]': 'JavaScript makes a suspicious call to the "media.newplayer" function (e.g. CVE-2009-4324).', + 'media.newPlayer': 'JavaScript makes a suspicious call to the "media.newplayer" function (e.g. CVE-2009-4324).', + 'CoolType.SING.uniqueName': 'JavaScript uses a known vulnerable compact font format object (e.g. CVE-2010-2883)', + '.rawValue': 'JavaScript makes a suspicious call to the "rawValue" method (e.g. CVE-2010-0188).', + 'app.addToolButton': 'JavaScript contains ToolButton (e.g. CVE-2014-0496, CVE-2013-3346).', + 'app.removeToolButton': 'JavaScript contains ToolButton (e.g. CVE-2014-0496, CVE-2013-3346).', + 'spell.customDictionaryOpen': 'JavaScript makes a suspicious call to the "spell.customDictionaryOpen" function (e.g. CVE-2009-1493).', + '.keep.previous = "contentArea"': 'JavaScript modifies ".keep.previous" property (e.g. CVE-2013-0640).', + 'collab.collectEmailInfo': 'JavaScript makes a suspicious call to the "collab.collectEmailInfo" function (e.g. CVE-2007-5659).', + 'getAnnots': 'JavaScript makes suspicious use of the "getAnnots" method (e.g. CVE-2009-1492).', + '': 'JavaScript contains choiceList object (e.g. CVE-2013-0640).', + } + + found = False + # Minimize and beautify JS to improve detection + data += self.js_beautify(self.js_minimize(data.lower())) + for key, value in detection_strings.items(): + if key.lower() in data: + found = True + self.add_detection(value, obj_id) + return found + def add_objects(self, obj_id, reason, data): """ Manage the insertion of child objects @@ -256,10 +350,10 @@ def add_objects(self, obj_id, reason, data): - Fields: title, header, search window size """ file_sigs = [('Flash', 'CWS', 50), - ('Flash', 'FWS', 50)] + ('Flash', 'FWS', 50)] file_sigs_found = False - #Filter/extract embedded files that are being submitted + # Filter/extract embedded files that are being submitted if reason == 'EmbeddedFile': for sig in file_sigs: search_header = sig[1] @@ -270,10 +364,14 @@ def add_objects(self, obj_id, reason, data): reason = '{} ({})'.format(reason, sig[0]) data = data[offset:] break - if file_sigs_found == False: + if file_sigs_found is False: + return + elif reason == 'JavaScript': + # Filter/add interesting JavaScript + if not self.js_detection(obj_id, data): return - #Add object to addded_files list + # Add object to addded_files list md5_digest = hashlib.md5(data).hexdigest() self.added_files.append([md5_digest, obj_id, @@ -284,45 +382,45 @@ def add_objects(self, obj_id, reason, data): def run_pdfparser(self, data): """ Uses pdf-parser to get information for each object. - """ + """ oPDFParser = pdfparser.cPDFParser(data) done = False found_objects = {} - #Walk the PDF and inspect PDF objects + # Walk the PDF and inspect PDF objects found_objects = self.object_search(data) - while done == False: + while done is False: try: pdf_object = oPDFParser.GetObject() except Exception as e: pdf_object = None - if pdf_object != None: + if pdf_object is not None: if pdf_object.type in [pdfparser.PDF_ELEMENT_INDIRECT_OBJECT]: - #Get general information for this PDF object + # Get general information for this PDF object rawContent = pdfparser.FormatOutput(pdf_object.content, True) section_md5_digest = hashlib.md5(rawContent).hexdigest() section_entropy = self.H(rawContent) object_type = pdf_object.GetType() - #Access data associated with this PDF object + # Access data associated with this PDF object if pdf_object.ContainsStream(): object_stream = True try: - #decompress stream using codec - streamContent = pdf_object.Stream() + # decompress stream using codec + streamContent = pdf_object.Stream() except Exception as e: - streamContent = "decompress failed." + streamContent = 'decompress failed.' - if "decompress failed." in streamContent[:50]: - #Provide raw stream data + if 'decompress failed.' in streamContent[:50]: + # Provide raw stream data streamContent = pdf_object.Stream('') - #Stream returns list of object tags (not actual stream data) + # Stream returns list of object tags (not actual stream data) if type(streamContent) == list: streamContent = pdfparser.FormatOutput(pdf_object.content, True) - #Inspect pdf_object.content and extract raw stream + # Inspect pdf_object.content and extract raw stream stream_start = streamContent.find('stream') + len('stream') stream_end = streamContent.rfind('endstream') if stream_start >= 0 and stream_end > 0: @@ -333,54 +431,78 @@ def run_pdfparser(self, data): object_stream = False stream_md5_digest = '' - #Collect references between this object and others + # Collect references between this object and others object_references = [] for reference in pdf_object.GetReferences(): object_references.append(reference[0]) object_references = ','.join(object_references) - #Get results from the object searching + # Get results from the object searching object_content = [] if found_objects.get('js'): if str(pdf_object.id) in found_objects.get('js'): object_content.append('JavaScript') - #Submit JavaScript objects to CRITS + # Pass JavaScript to add_objects for further analysis if object_stream: self.add_objects('{} (stream)'.format(pdf_object.id), - 'JavaScript', - streamContent) + 'JavaScript', + streamContent) else: self.add_objects('{}'.format(pdf_object.id), - 'JavaScript', - rawContent) + 'JavaScript', + rawContent) if found_objects.get('file'): if str(pdf_object.id) in found_objects.get('file'): object_content.append('EmbeddedFile') - #Submit (some) embedded files to CRITS + # Pass embedded files to add_objects for further analysis if object_stream: self.add_objects('{} (stream)'.format(pdf_object.id), - 'EmbeddedFile', - streamContent) + 'EmbeddedFile', + streamContent) else: self.add_objects('{}'.format(pdf_object.id), - 'EmbeddedFile', - rawContent) + 'EmbeddedFile', + rawContent) result = { - "obj_id": pdf_object.id, - "obj_version": pdf_object.version, - "size": len(rawContent), - "type": object_type, - "entropy": section_entropy, - "content": ','.join(object_content), - "x_refs": object_references, - "stream": object_stream, - "stream_md5": stream_md5_digest, + 'obj_id': pdf_object.id, + 'obj_version': pdf_object.version, + 'size': len(rawContent), + 'type': object_type, + 'entropy': section_entropy, + 'content': ','.join(object_content), + 'x_refs': object_references, + 'stream': object_stream, + 'stream_md5': stream_md5_digest, } self._add_result('pdf_parser', section_md5_digest, result) else: done = True + def run_pdfheaders(self, data): + """ + Search for multiple PDF headers + """ + header = '%PDF' + footers = ['%EOF\x0d', '%EOF\x0a'] + found = False + + if data.count(header) > 1: + for footer in footers: + # Reverse searching for embedded PDFs + end = data.rfind(footer) + while end >= 0: + start = data.rfind(header, 0, end) + end += len(footer) + if start < 1: + # Break if not found or known sample header. + break + self.add_detection('PDF contains embedded PDF document at offset {} to {}.'.format(start, end)) + found = True + end = data.rfind(footer, 0, start) + if found: + self._info('PDFInfo analysis may contain duplicate object ids due to the presence of an embedded PDF.') + def run(self, obj, config): """ Run PDF service @@ -398,23 +520,19 @@ def run(self, obj, config): self._notify() self.run_pdfparser(data) self._notify() + self.run_pdfheaders(data) + + # Add detection items + for key, value in sorted(self.detection.items(), reverse=True): + self._add_result('pdf_detection', key, {'obj_id(s)': ', '.join(value)}) - #Add child objects + # Add child objects if config['pdf_objects']: for f in self.added_files: - self._info('{} {} {} {}'.format(f[0], f[1], f[2], f[3])) - """ handle_file(f[0], f[4], obj.source, related_id=str(obj.id), campaign=obj.campaign, method=self.name, relationship='Extracted_From', user=self.current_task.username) - self._add_result("pdf_objects_added", f[0], {'obj_id': f[1], 'size': f[1], 'reason': f[3]}) - """ - #Add detection items - for key, value in self.detection.items(): - self._add_result("pdf_detection", key, {'description': value[0], 'obj_id(s)': ', '.join(value[1])}) - - def _parse_error(self, item, e): - self._error("Error parsing %s (%s): %s" % (item, e.__class__.__name__, e)) + self._add_result('pdf_objects_added', f[0], {'obj_id': f[1], 'size': f[1], 'reason': f[3]}) From ff7246958da27014ffdb3c26aefb72c2eb3c0716 Mon Sep 17 00:00:00 2001 From: Wootski Date: Fri, 3 Apr 2015 12:19:35 -0400 Subject: [PATCH 9/9] Additional header for compressed Flash files. --- pdfinfo_service/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pdfinfo_service/__init__.py b/pdfinfo_service/__init__.py index bcb9c0d2..03b97a98 100644 --- a/pdfinfo_service/__init__.py +++ b/pdfinfo_service/__init__.py @@ -350,7 +350,8 @@ def add_objects(self, obj_id, reason, data): - Fields: title, header, search window size """ file_sigs = [('Flash', 'CWS', 50), - ('Flash', 'FWS', 50)] + ('Flash', 'FWS', 50), + ('Flash', 'ZWS', 50)] file_sigs_found = False # Filter/extract embedded files that are being submitted