From c699d0ebf5acc7f2e751a0d4d0be791cf30e1f75 Mon Sep 17 00:00:00 2001
From: Wootski <wootski@users.noreply.github.com>
Date: Fri, 27 Mar 2015 11:55:44 -0400
Subject: [PATCH 1/9] Update PDFInfo imports - Added imports for adding child
 objects - re-order existing imports to look nicer

---
 pdfinfo_service/__init__.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/pdfinfo_service/__init__.py b/pdfinfo_service/__init__.py
index 5e1b219f..dd8431eb 100644
--- a/pdfinfo_service/__init__.py
+++ b/pdfinfo_service/__init__.py
@@ -1,13 +1,15 @@
 import hashlib
 import logging
-
-from crits.services.core import Service, ServiceConfigError
-
-import pdfparser
-import pdfid
 import math
 import re
 import json
+import pdfparser
+import pdfid
+
+from crits.services.core import Service, ServiceConfigError
+from crits.samples.handlers import handle_file
+
+from . import forms
 
 logger = logging.getLogger(__name__)
 

From 098a5692bafb8eaa682d558686eaea906e267f44 Mon Sep 17 00:00:00 2001
From: Wootski <wootski@users.noreply.github.com>
Date: Fri, 27 Mar 2015 11:59:59 -0400
Subject: [PATCH 2/9] Render runtime config form. - Prompt user to submit
 suspicious PDF child objects.

---
 pdfinfo_service/__init__.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/pdfinfo_service/__init__.py b/pdfinfo_service/__init__.py
index dd8431eb..556b4b9a 100644
--- a/pdfinfo_service/__init__.py
+++ b/pdfinfo_service/__init__.py
@@ -26,6 +26,7 @@ class PDFInfoService(Service):
     version = '1.2.0'
     description = "Extract information from PDF files."
     supported_types = ['Sample']
+    added_files = []
 
     @staticmethod
     def valid_for(obj):
@@ -33,6 +34,26 @@ def valid_for(obj):
         if not obj.is_pdf():
             raise ServiceConfigError("Not a valid PDF.")
 
+    @staticmethod
+    def bind_runtime_form(analyst, config):
+        if 'pdf_objects' not in config:
+            config['pdf_objects'] = False
+        return forms.PDFInfoRunForm(config)
+
+    @classmethod
+    def generate_runtime_form(self, analyst, config, crits_type, identifier):
+        return render_to_string('services_run_form.html',
+                                {'name': self.name,
+                                 'form': forms.PDFInfoRunForm(),
+                                 'crits_type': crits_type,
+                                 'identifier': identifier})
+
+    @staticmethod
+    def get_config(existing_config):
+        # There are no config options for this service, blow away any existing
+        # configs.
+        return {}
+
     def H(self, data):
         """
         Calculate entropy for provided data

From 269cd6f471935cc97545f4bc6c0b7ba49fa9b791 Mon Sep 17 00:00:00 2001
From: Wootski <wootski@users.noreply.github.com>
Date: Fri, 27 Mar 2015 14:26:37 -0400
Subject: [PATCH 3/9] Testing child files and improvements to object
 identification. - Testing code included for submitting child objects of
 interest - Improvements to identifying object content type.

---
 pdfinfo_service/README      |  4 ++
 pdfinfo_service/__init__.py | 76 ++++++++++++++++++++++++++++++++++++-
 pdfinfo_service/forms.py    | 12 ++++++
 3 files changed, 91 insertions(+), 1 deletion(-)
 create mode 100644 pdfinfo_service/forms.py

diff --git a/pdfinfo_service/README b/pdfinfo_service/README
index 8cb557ea..414b5f34 100644
--- a/pdfinfo_service/README
+++ b/pdfinfo_service/README
@@ -15,3 +15,7 @@ How to upgrade PDF tools:
             - self.infile = open(file, 'rb')
             + import StringIO
             + self.infile = StringIO.StringIO(file)
+
+TODO:
+- Handle encrypted PDF objects
+    - Example sample: 1e46c60e65ae9f9c9c8850372d8da491
diff --git a/pdfinfo_service/__init__.py b/pdfinfo_service/__init__.py
index 556b4b9a..3b4af00e 100644
--- a/pdfinfo_service/__init__.py
+++ b/pdfinfo_service/__init__.py
@@ -6,6 +6,8 @@
 import pdfparser
 import pdfid
 
+from django.template.loader import render_to_string
+
 from crits.services.core import Service, ServiceConfigError
 from crits.samples.handlers import handle_file
 
@@ -123,7 +125,7 @@ def object_search(self, data, search_size=100):
                         (r'js', '/JS\n'),
                         (r'js', '/JS\r\n'),
                         (r'file', '/F\n'),
-                        (r'file', '/F\r\n')]
+                        (r'file', '/F\r\n'),]
 
         #Walk the PDF objects
         while done == False:
@@ -156,10 +158,49 @@ def object_search(self, data, search_size=100):
                                 objects[item[0]].append(str(pdf_object.id))
                             else:
                                 objects[item[0]] = [str(pdf_object.id)]
+                    #Check object type
+                    if pdf_object.GetType() == '/EmbeddedFile':
+                        if objects.get('file'):
+                            objects['file'].append(str(pdf_object.id))
+                        else:
+                            objects['file'] = [str(pdf_object.id)]
+                    
             else:
                 done = True
         return objects
 
+    def add_objects(self, obj_id, reason, data):
+        """
+        Manage the insertion of child objects
+        - Use signatures to filter/inspect embedded files
+            - Fields: title, header, search window size
+        """
+        file_sigs = [('Flash', 'CWS', 50),
+                    ('Flash', 'FWS', 50)]
+        file_sigs_found = False
+
+        #Filter/extract embedded files that are being submitted
+        if reason == 'EmbeddedFile':
+            for sig in file_sigs:
+                search_header = sig[1]
+                search_window = sig[2]
+                offset = stream[:search_window].find(search_header)
+                if offset >= 0:
+                    file_sigs_found = True
+                    reason = '{} ({})'.format(reason, sig[0])
+                    data = data[offset:]
+                    break
+            if file_sigs_found == False:
+                return
+
+        #Add object to addded_files list
+        md5_digest = hashlib.md5(data).hexdigest()
+        self.added_files.append([md5_digest,
+                                obj_id,
+                                len(data),
+                                reason,
+                                data])
+
     def run_pdfparser(self, data):
         """
         Uses pdf-parser to get information for each object.
@@ -223,9 +264,27 @@ def run_pdfparser(self, data):
                     if found_objects.get('js'):
                         if str(pdf_object.id) in found_objects.get('js'):
                             object_content.append('JavaScript')
+                            #Submit JavaScript objects to CRITS
+                            if object_stream:
+                                self.add_objects('{} (stream)'.format(pdf_object.id),
+                                                   'JavaScript',
+                                                   streamContent)
+                            else:
+                                self.add_objects('{}'.format(pdf_object.id),
+                                                   'JavaScript',
+                                                   rawContent)
                     if found_objects.get('file'):
                         if str(pdf_object.id) in found_objects.get('file'):
                             object_content.append('EmbeddedFile')
+                            #Submit (some) embedded files to CRITS
+                            if object_stream:
+                                self.add_objects('{} (stream)'.format(pdf_object.id),
+                                                   'EmbeddedFile',
+                                                   streamContent)
+                            else:
+                                self.add_objects('{}'.format(pdf_object.id),
+                                                   'EmbeddedFile',
+                                                   rawContent)
 
                     result = {
                             "obj_id":           pdf_object.id,
@@ -258,6 +317,21 @@ def run(self, obj, config):
         self.run_pdfid(data)
         self._notify()
         self.run_pdfparser(data)
+        self._notify()
+
+        #Add child objects
+        if config['pdf_objects']:
+            for f in self.added_files:
+                self._info('{} {} {} {}'.format(f[0], f[1], f[2], f[3]))
+                """
+                handle_file(f[0], f[4], obj.source,
+                            related_id=str(obj.id),
+                            campaign=obj.campaign,
+                            method=self.name,
+                            relationship='Extracted_From',
+                            user=self.current_task.username)
+                self._add_result("pdf_objects_added", f[0], {'obj_id':f[1],'size': f[1],'reason': f[3]})
+                """
 
     def _parse_error(self, item, e):
         self._error("Error parsing %s (%s): %s" % (item, e.__class__.__name__, e))
diff --git a/pdfinfo_service/forms.py b/pdfinfo_service/forms.py
new file mode 100644
index 00000000..b57ead96
--- /dev/null
+++ b/pdfinfo_service/forms.py
@@ -0,0 +1,12 @@
+from django import forms
+
+class PDFInfoRunForm(forms.Form):
+    error_css_class = 'error'
+    required_css_class = 'required'
+    pdf_objects = forms.BooleanField(required=False,
+                                  label="Objects",
+                                  help_text="New samples from suspicious PDF objects.",
+                                  initial=True)
+
+    def __init__(self, *args, **kwargs):
+        super(PDFInfoRunForm, self).__init__(*args, **kwargs)

From c657597aa16d57c4e8146a54be9a21d5220c1b78 Mon Sep 17 00:00:00 2001
From: Wootski <wootski@users.noreply.github.com>
Date: Fri, 27 Mar 2015 16:33:44 -0400
Subject: [PATCH 4/9] Bug fix: incorrectly referencing data

---
 pdfinfo_service/README      | 7 +++++++
 pdfinfo_service/__init__.py | 2 +-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/pdfinfo_service/README b/pdfinfo_service/README
index 414b5f34..d0335602 100644
--- a/pdfinfo_service/README
+++ b/pdfinfo_service/README
@@ -17,5 +17,12 @@ How to upgrade PDF tools:
             + self.infile = StringIO.StringIO(file)
 
 TODO:
+- Handle JavaScript objects that point to /Names and don't contain JavaScript
+    - Example sample: 143a09611c45ac34ff0f85cc5efcc2e.pdf
+        - Raw content: << /Names [ (a) 36 0 R (b) 37 0 R (c) 16 0 R (c) 55 0 R ] >>
+    - Example sample: 2a7b8180da2906c9889f13fa912df6a0
+        - Raw content: << /Names [ t 17 0 R ] >>
 - Handle encrypted PDF objects
     - Example sample: 1e46c60e65ae9f9c9c8850372d8da491
+- Look for multiple PDF files concatenated together (PDF/EOF headers)
+    - Example: sample: 1188ea8f0d086a8860a3aafb54a3fa76
diff --git a/pdfinfo_service/__init__.py b/pdfinfo_service/__init__.py
index 3b4af00e..7c6f7751 100644
--- a/pdfinfo_service/__init__.py
+++ b/pdfinfo_service/__init__.py
@@ -184,7 +184,7 @@ def add_objects(self, obj_id, reason, data):
             for sig in file_sigs:
                 search_header = sig[1]
                 search_window = sig[2]
-                offset = stream[:search_window].find(search_header)
+                offset = data[:search_window].find(search_header)
                 if offset >= 0:
                     file_sigs_found = True
                     reason = '{} ({})'.format(reason, sig[0])

From 6e6965af7cdec09bfbcc56e3c9827e83d2971df1 Mon Sep 17 00:00:00 2001
From: Wootski <wootski@users.noreply.github.com>
Date: Fri, 27 Mar 2015 16:54:23 -0400
Subject: [PATCH 5/9] Sort PDFid output - Items now sorted by count.

---
 pdfinfo_service/__init__.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pdfinfo_service/__init__.py b/pdfinfo_service/__init__.py
index 7c6f7751..ebaf4156 100644
--- a/pdfinfo_service/__init__.py
+++ b/pdfinfo_service/__init__.py
@@ -96,12 +96,14 @@ def run_pdfid(self, data):
         
         if xml_json_success:
             try:
-                for item in pdfid_dict['pdfid']['keywords']['keyword']:
+                pdf_summary = pdfid_dict['pdfid']['keywords']['keyword']
+                for item in sorted(pdf_summary, key=lambda x: int(x['count']), reverse=True):
                     self._add_result('pdfid', item['name'], {'count':item['count']})
             except KeyError:
                 pass
         else:
-            for count, item in re.findall(r'<Keyword\sCount="([^\"]+)"[^>]+Name=\"([^\"]+)\"',xml_data.toxml()):
+            pdf_summary = re.findall(r'<Keyword\sCount="([^\"]+)"[^>]+Name=\"([^\"]+)\"',xml_data.toxml())
+            for count, item in sorted(pdf_summary, key=lambda x: int(x[0]), reverse=True):
                 self._add_result('pdfid', item, {'count':count})
 
     def object_search(self, data, search_size=100):

From c3ecb6a5005c62c06f974935610b6df1e11a1aef Mon Sep 17 00:00:00 2001
From: Wootski <wootski@users.noreply.github.com>
Date: Mon, 30 Mar 2015 14:54:18 -0400
Subject: [PATCH 6/9] Add detection using PDFid - Notify user of JavaScript,
 encryption and open actions - Look for uneven counts of obj and endobj

---
 pdfinfo_service/README      |  1 +
 pdfinfo_service/__init__.py | 53 ++++++++++++++++++++++++++++++++++---
 2 files changed, 51 insertions(+), 3 deletions(-)

diff --git a/pdfinfo_service/README b/pdfinfo_service/README
index d0335602..7a01a5af 100644
--- a/pdfinfo_service/README
+++ b/pdfinfo_service/README
@@ -22,6 +22,7 @@ TODO:
         - Raw content: << /Names [ (a) 36 0 R (b) 37 0 R (c) 16 0 R (c) 55 0 R ] >>
     - Example sample: 2a7b8180da2906c9889f13fa912df6a0
         - Raw content: << /Names [ t 17 0 R ] >>
+    - Typically these objects contain one line, the reference.
 - Handle encrypted PDF objects
     - Example sample: 1e46c60e65ae9f9c9c8850372d8da491
 - Look for multiple PDF files concatenated together (PDF/EOF headers)
diff --git a/pdfinfo_service/__init__.py b/pdfinfo_service/__init__.py
index ebaf4156..81db4c24 100644
--- a/pdfinfo_service/__init__.py
+++ b/pdfinfo_service/__init__.py
@@ -29,6 +29,7 @@ class PDFInfoService(Service):
     description = "Extract information from PDF files."
     supported_types = ['Sample']
     added_files = []
+    detection = {}
 
     @staticmethod
     def valid_for(obj):
@@ -80,12 +81,29 @@ def _get_pdf_version(self, data):
         else:
             return "0.0"
 
+    def add_detection(self, tag, reason, obj_id=''):
+        """
+        Add a new items to the list of detection results
+        """
+        if tag in self.detection.keys():
+            if not obj_id in self.detection[tag][1]:
+                self.detection[tag][1].append(obj_id)
+        else:
+            self.detection[tag] = [reason, [obj_id]]
+
     def run_pdfid(self, data):
         """
         Uses PDFid to generate stats for the PDF
         - Display keyword matches
         """
         xml_json_success = True
+        javascript = False
+        encrypted = False
+        open_action = False
+        start_obj = 0
+        end_obj = 0
+        results_list = []
+        
 
         xml_data = pdfid.PDFiD(data)
         try:
@@ -98,13 +116,39 @@ def run_pdfid(self, data):
             try:
                 pdf_summary = pdfid_dict['pdfid']['keywords']['keyword']
                 for item in sorted(pdf_summary, key=lambda x: int(x['count']), reverse=True):
-                    self._add_result('pdfid', item['name'], {'count':item['count']})
+                    results_list.append([item['count'], item['name']])
+                    self._add_result('pdfid', item['name'], {'count': item['count']})
             except KeyError:
                 pass
         else:
             pdf_summary = re.findall(r'<Keyword\sCount="([^\"]+)"[^>]+Name=\"([^\"]+)\"',xml_data.toxml())
+            results_list = pdf_summary
             for count, item in sorted(pdf_summary, key=lambda x: int(x[0]), reverse=True):
-                self._add_result('pdfid', item, {'count':count})
+                self._add_result('pdfid', item, {'count': count})
+
+        #Detection rules using PDFid
+        for count, item in results_list:
+            if int(count) > 0:
+                if item == 'obj':
+                    start_obj = count
+                elif item == 'endobj':
+                    end_obj = count
+                elif item == '/JS':
+                    javascript = True
+                elif item == '/JavaScript':
+                    javascript = True
+                elif item == '/Encrypt':
+                    encrypted = True
+                elif item == "/OpenAction":
+                    open_action = True
+        if javascript:
+            self.add_detection('/JavaScript, /JS', 'PDF contains JavaScript.')
+        if encrypted:
+            self.add_detection('/Encrypted', 'PDF contains encrypted content.')
+        if open_action:
+            self.add_detection('/OpenAction', 'PDF performs defined actions when opened.')
+        if not start_obj == end_obj:
+            self.add_detection('obj, endobj', 'PDF contains uneven number of "obj" and "endobj" statements.') 
 
     def object_search(self, data, search_size=100):
         """
@@ -332,8 +376,11 @@ def run(self, obj, config):
                             method=self.name,
                             relationship='Extracted_From',
                             user=self.current_task.username)
-                self._add_result("pdf_objects_added", f[0], {'obj_id':f[1],'size': f[1],'reason': f[3]})
+                self._add_result("pdf_objects_added", f[0], {'obj_id': f[1], 'size': f[1], 'reason': f[3]})
                 """
+        #Add detection items
+        for key, value in self.detection.items():
+            self._add_result("pdf_detection", key, {'description': value[0], 'obj_id(s)': ', '.join(value[1])})
 
     def _parse_error(self, item, e):
         self._error("Error parsing %s (%s): %s" % (item, e.__class__.__name__, e))

From aae712a9162f4d0c76a5455edfa71c75bd8ed63a Mon Sep 17 00:00:00 2001
From: Wootski <wootski@users.noreply.github.com>
Date: Mon, 30 Mar 2015 15:03:04 -0400
Subject: [PATCH 7/9] Add JavaScript helper functions - JS minimization - JS
 formatting using the jsbeautifier library

---
 pdfinfo_service/DEPENDENCIES |  3 +++
 pdfinfo_service/__init__.py  | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+)

diff --git a/pdfinfo_service/DEPENDENCIES b/pdfinfo_service/DEPENDENCIES
index e02af7aa..4aaaba43 100644
--- a/pdfinfo_service/DEPENDENCIES
+++ b/pdfinfo_service/DEPENDENCIES
@@ -1,2 +1,5 @@
 PDFInfo leverages the work of Didier Stevens and his pdf-parser. That script
 requires Numpy to run.
+
+PDFInfo also requires jsbeautifier:
+pip install jsbeautifier
diff --git a/pdfinfo_service/__init__.py b/pdfinfo_service/__init__.py
index 81db4c24..78124fa2 100644
--- a/pdfinfo_service/__init__.py
+++ b/pdfinfo_service/__init__.py
@@ -5,6 +5,7 @@
 import json
 import pdfparser
 import pdfid
+import jsbeautifier
 
 from django.template.loader import render_to_string
 
@@ -91,6 +92,39 @@ def add_detection(self, tag, reason, obj_id=''):
         else:
             self.detection[tag] = [reason, [obj_id]]
 
+    def js_beautify(self, stream):
+        """
+        Beautify Javascript output
+        """
+        if stream:
+            return str(jsbeautifier.beautify(stream))
+        return
+
+    def js_minimize(self,data):
+        """
+        Very simple JavaScript minimization
+        - Attempt to simplify embedded JavaScript
+            - Remove comments
+            - Remove string escapes (\x20)
+            - Replace formatting
+            - Minimize string logic
+            - Replace dict style references (abc["xyz"] for abc.xyz)
+        """
+        result = None
+        try:
+            result = re.sub(r'//*(.+?)/*/','',data)
+            result = result.decode('string_escape','ignore')
+            result = urllib2.unquote(result)
+            result = result.replace('\\n', '\n')
+            result = result.replace('\\r', '\r')
+            result = result.replace('\\t', '\t')
+            result = result.replace('"+"', '')
+            result = result.replace('\'+\'','')
+            result = re.sub(r'(\w+)\[\"([^\"]+)\"\]', r'\1.\2', result)
+        except Exception:
+            pass
+        return result
+
     def run_pdfid(self, data):
         """
         Uses PDFid to generate stats for the PDF

From d53fc25f5dae6c595088c7917a5991878860f2b9 Mon Sep 17 00:00:00 2001
From: Wootski <wootski@users.noreply.github.com>
Date: Thu, 2 Apr 2015 09:51:33 -0400
Subject: [PATCH 8/9] PDF Detection techniques and formatting changes - PEP8
 changes - Locate embedded PDF documents - Detect PDF strings of interest -
 Submit child files based on JS/file header detection

---
 pdfinfo_service/README      |   8 -
 pdfinfo_service/__init__.py | 298 +++++++++++++++++++++++++-----------
 2 files changed, 208 insertions(+), 98 deletions(-)

diff --git a/pdfinfo_service/README b/pdfinfo_service/README
index 7a01a5af..414b5f34 100644
--- a/pdfinfo_service/README
+++ b/pdfinfo_service/README
@@ -17,13 +17,5 @@ How to upgrade PDF tools:
             + self.infile = StringIO.StringIO(file)
 
 TODO:
-- Handle JavaScript objects that point to /Names and don't contain JavaScript
-    - Example sample: 143a09611c45ac34ff0f85cc5efcc2e.pdf
-        - Raw content: << /Names [ (a) 36 0 R (b) 37 0 R (c) 16 0 R (c) 55 0 R ] >>
-    - Example sample: 2a7b8180da2906c9889f13fa912df6a0
-        - Raw content: << /Names [ t 17 0 R ] >>
-    - Typically these objects contain one line, the reference.
 - Handle encrypted PDF objects
     - Example sample: 1e46c60e65ae9f9c9c8850372d8da491
-- Look for multiple PDF files concatenated together (PDF/EOF headers)
-    - Example: sample: 1188ea8f0d086a8860a3aafb54a3fa76
diff --git a/pdfinfo_service/__init__.py b/pdfinfo_service/__init__.py
index 78124fa2..bcb9c0d2 100644
--- a/pdfinfo_service/__init__.py
+++ b/pdfinfo_service/__init__.py
@@ -25,9 +25,9 @@ class PDFInfoService(Service):
     to scan PDF files, extract metadata and create hashes of each object.
     """
 
-    name = "pdfinfo"
+    name = 'pdfinfo'
     version = '1.2.0'
-    description = "Extract information from PDF files."
+    description = 'Extract information from PDF files.'
     supported_types = ['Sample']
     added_files = []
     detection = {}
@@ -36,7 +36,7 @@ class PDFInfoService(Service):
     def valid_for(obj):
         # Only run on PDF files
         if not obj.is_pdf():
-            raise ServiceConfigError("Not a valid PDF.")
+            raise ServiceConfigError('Not a valid PDF.')
 
     @staticmethod
     def bind_runtime_form(analyst, config):
@@ -82,15 +82,15 @@ def _get_pdf_version(self, data):
         else:
             return "0.0"
 
-    def add_detection(self, tag, reason, obj_id=''):
+    def add_detection(self, desc, obj_id=''):
         """
         Add a new items to the list of detection results
         """
-        if tag in self.detection.keys():
-            if not obj_id in self.detection[tag][1]:
-                self.detection[tag][1].append(obj_id)
+        if desc in self.detection.keys():
+            if obj_id not in self.detection[desc]:
+                self.detection[desc].append(obj_id)
         else:
-            self.detection[tag] = [reason, [obj_id]]
+            self.detection[desc] = [obj_id]
 
     def js_beautify(self, stream):
         """
@@ -100,7 +100,7 @@ def js_beautify(self, stream):
             return str(jsbeautifier.beautify(stream))
         return
 
-    def js_minimize(self,data):
+    def js_minimize(self, data):
         """
         Very simple JavaScript minimization
         - Attempt to simplify embedded JavaScript
@@ -112,14 +112,14 @@ def js_minimize(self,data):
         """
         result = None
         try:
-            result = re.sub(r'//*(.+?)/*/','',data)
-            result = result.decode('string_escape','ignore')
+            result = re.sub(r'//*(.+?)/*/', '', data)
+            result = result.decode('string_escape', 'ignore')
             result = urllib2.unquote(result)
             result = result.replace('\\n', '\n')
             result = result.replace('\\r', '\r')
             result = result.replace('\\t', '\t')
             result = result.replace('"+"', '')
-            result = result.replace('\'+\'','')
+            result = result.replace('\'+\'', '')
             result = re.sub(r'(\w+)\[\"([^\"]+)\"\]', r'\1.\2', result)
         except Exception:
             pass
@@ -137,15 +137,14 @@ def run_pdfid(self, data):
         start_obj = 0
         end_obj = 0
         results_list = []
-        
 
         xml_data = pdfid.PDFiD(data)
         try:
-            json_data = pdfid.PDFiD2JSON(xml_data,'')
+            json_data = pdfid.PDFiD2JSON(xml_data, '')
             pdfid_dict = json.loads(json_data)[0]
         except UnicodeDecodeError:
             xml_json_success = False
-        
+
         if xml_json_success:
             try:
                 pdf_summary = pdfid_dict['pdfid']['keywords']['keyword']
@@ -155,12 +154,12 @@ def run_pdfid(self, data):
             except KeyError:
                 pass
         else:
-            pdf_summary = re.findall(r'<Keyword\sCount="([^\"]+)"[^>]+Name=\"([^\"]+)\"',xml_data.toxml())
+            pdf_summary = re.findall(r'<Keyword\sCount="([^\"]+)"[^>]+Name=\"([^\"]+)\"', xml_data.toxml())
             results_list = pdf_summary
             for count, item in sorted(pdf_summary, key=lambda x: int(x[0]), reverse=True):
                 self._add_result('pdfid', item, {'count': count})
 
-        #Detection rules using PDFid
+        # Detection rules using PDFid
         for count, item in results_list:
             if int(count) > 0:
                 if item == 'obj':
@@ -176,13 +175,13 @@ def run_pdfid(self, data):
                 elif item == "/OpenAction":
                     open_action = True
         if javascript:
-            self.add_detection('/JavaScript, /JS', 'PDF contains JavaScript.')
+            self.add_detection('PDF contains JavaScript.')
         if encrypted:
-            self.add_detection('/Encrypted', 'PDF contains encrypted content.')
+            self.add_detection('PDF contains encrypted content.')
         if open_action:
-            self.add_detection('/OpenAction', 'PDF performs defined actions when opened.')
+            self.add_detection('PDF performs defined actions when opened.')
         if not start_obj == end_obj:
-            self.add_detection('obj, endobj', 'PDF contains uneven number of "obj" and "endobj" statements.') 
+            self.add_detection('PDF contains uneven number of "obj" and "endobj" definitions.')
 
     def object_search(self, data, search_size=100):
         """
@@ -190,65 +189,160 @@ def object_search(self, data, search_size=100):
         @return dictionary containing object types and object id's
         - Use regex and strings to locate PDF tags of interest
 
-        Note: It is important that objects_str definitions do 
+        Note: It is important that objects_str definitions do
             not detect objects found with objects_regex defs.
         """
         oPDFParser = pdfparser.cPDFParser(data)
-        done = False 
+        done = False
         objects = {}
         objects_regex = [(r'js', r'\/JavaScript\s(\d+)\s\d+\sR'),
-                        (r'js', r'\/JS\s(\d+)\s\d+\sR'),
-                        (r'file', r'\/F\s(\d+)\s\d+\sR')]
+                         (r'js', r'\/JS\s(\d+)\s\d+\sR'),
+                         (r'file', r'\/F\s(\d+)\s\d+\sR')]
 
         objects_str = [(r'js', '/JavaScript\n'),
-                        (r'js', '/JavaScript\r\n'),
-                        (r'js', '/JS\n'),
-                        (r'js', '/JS\r\n'),
-                        (r'file', '/F\n'),
-                        (r'file', '/F\r\n'),]
-
-        #Walk the PDF objects
-        while done == False:
+                       (r'js', '/JavaScript\r\n'),
+                       (r'js', '/JS\n'),
+                       (r'js', '/JS\r\n'),
+                       (r'file', '/F\n'),
+                       (r'file', '/F\r\n')]
+
+        # Walk the PDF objects
+        while done is False:
             try:
                 pdf_object = oPDFParser.GetObject()
             except Exception as e:
                 pdf_object = None
 
-            if pdf_object != None:
+            if pdf_object is not None:
                 if pdf_object.type in [pdfparser.PDF_ELEMENT_INDIRECT_OBJECT]:
-                    #See if this PDF object has references to items of interest
+                    # See if this PDF object has references to items of interest
                     rawContent = pdfparser.FormatOutput(pdf_object.content, True)
                     pdf_references = pdf_object.GetReferences()
                     if pdf_references:
-                        #Match getReferences() with objects_regex results
+                        # Match getReferences() with regex search results
                         for item in objects_regex:
-                            matches = re.findall(item[1],rawContent[:search_size])
+                            matches = re.findall(item[1], rawContent[:search_size])
                             for match in matches:
                                 for ref in pdf_references:
-                                    #Record found items
+                                    # Record found items
                                     if match == ref[0]:
                                         if objects.get(item[0]):
                                             objects[item[0]].append(match)
                                         else:
                                             objects[item[0]] = [match]
-                    #Find items within the current object.
+                    # Find items within the current object.
                     for item in objects_str:
                         if pdf_object.Contains(item[1]):
                             if objects.get(item[0]):
                                 objects[item[0]].append(str(pdf_object.id))
                             else:
                                 objects[item[0]] = [str(pdf_object.id)]
-                    #Check object type
+                    # Check object type
                     if pdf_object.GetType() == '/EmbeddedFile':
                         if objects.get('file'):
                             objects['file'].append(str(pdf_object.id))
                         else:
                             objects['file'] = [str(pdf_object.id)]
-                    
             else:
                 done = True
         return objects
 
+    def js_detection(self, obj_id, data):
+        """
+        JavaScript detection techniques
+        - Look for string matches of interest
+        """
+        detection_strings = {
+            '=unescape': 'JavaScript contains a reference to the "unescape()" function.',
+            '= unescape': 'JavaScript contains a reference to the "unescape()" function.',
+            'unescape': 'JavaScript contains a reference to the "unescape()" function.',
+            'eval(': 'JavaScript contains a reference to the "eval()" function.',
+            '=eval': 'JavaScript contains a reference to the "eval()" function.',
+            '= eval': 'JavaScript contains a reference to the "eval()" function.',
+            '.replace': 'JavaScript contains a reference to the "replace" method.',
+            '.substring': 'JavaScript contains a reference to the "substring" method.',
+            '.toString': 'JavaScript contains a reference to the "toString" method.',
+            '.fromCharCode': 'JavaScript contains a reference to the "fromCharCode" method.',
+            '.charCodeAt': 'JavaScript contains a reference to the "charCodeAt" method.',
+            'util.byteToChar': 'JavaScript contains a reference to the "util.byteToChar" function.',
+            '.slice': 'JavaScript contains a reference to the "slice" method.',
+            '.concat': 'JavaScript contains a reference to the "concat" method.',
+            '.length': 'JavaScript contains a reference to the "length" method.',
+            'util.printd': 'JavaScript contains a reference to the "util.printd" function.',
+            'Math.ceil': 'JavaScript contains a reference to the "math.ceil" function.',
+            'app.viewerVersion': 'JavaScript attempts to detect the viewer version.',
+            'app.viewerType': 'JavaScript attempts to detect the viewer type.',
+            'app.setTimeOut': 'JavaScript calls "app.setTimeOut()", this can be used to evaluate code.',
+            'new Array': 'JavaScript creates an array.',
+            'try {} catch(e)': 'JavaScript contains empty exception handling block.',
+            'for(': 'JavaScript contains a for() loop.',
+            'for (': 'JavaScript contains a for() loop.',
+            'while(': 'JavaScript contains a while() loop.',
+            'while ': 'JavaScript contains a while() loop.',
+            'function ': 'JavaScript contains one or more function definitions.',
+            'shellcode': 'JavaScript references a suspicious variable name.',
+            '(sc)': 'JavaScript references a suspicious variable name.',
+            'sc.': 'JavaScript references a suspicious variable name.',
+            ' sc ': 'JavaScript references a suspicious variable name.',
+            'sc+': 'JavaScript references a suspicious variable name.',
+            'sc +': 'JavaScript references a suspicious variable name.',
+            'sc=': 'JavaScript references a suspicious variable name.',
+            'sc =': 'JavaScript references a suspicious variable name.',
+            'var nop': 'JavaScript references a suspicious variable name.',
+            'nop=': 'JavaScript references a suspicious variable name.',
+            'nop =': 'JavaScript references a suspicious variable name.',
+            'nop+': 'JavaScript references a suspicious variable name.',
+            'nop +': 'JavaScript references a suspicious variable name.',
+            'sled =': 'JavaScript references a suspicious variable name.',
+            'sled=': 'JavaScript references a suspicious variable name.',
+            'sled+': 'JavaScript references a suspicious variable name.',
+            'sled +': 'JavaScript references a suspicious variable name.',
+            'nopsled': 'JavaScript references a suspicious variable name.',
+            '\\x90\\x90\\x90\\x90': 'JavaScript references a suspicious byte sequence.',
+            'heap+': 'JavaScript references a suspicious variable name.',
+            'heap +': 'JavaScript references a suspicious variable name.',
+            'heap=': 'JavaScript references a suspicious variable name.',
+            'heap =': 'JavaScript references a suspicious variable name.',
+            '_heap': 'JavaScript references a suspicious variable name.',
+            'heapspray': 'JavaScript references a suspicious variable name.',
+            'var rop': 'JavaScript references a suspicious variable name.',
+            'rop_': 'JavaScript references a suspicious variable name.',
+            'rop=': 'JavaScript references a suspicious variable name.',
+            'rop =': 'JavaScript references a suspicious variable name.',
+            'rop+': 'JavaScript references a suspicious variable name.',
+            'rop +': 'JavaScript references a suspicious variable name.',
+            'payload': 'JavaScript references a suspicious variable name.',
+            'mem+': 'JavaScript references a suspicious variable name.',
+            'mem=': 'JavaScript references a suspicious variable name.',
+            'memory': 'JavaScript references a suspicious variable name.',
+            'exploit': 'JavaScript references a suspicious variable name.',
+            'util.printf': 'JavaScript makes a suspicious call to the "util.printf" function (e.g. CVE-2008-2992).',
+            'collab["GetIcon"]': 'JavaScript makes a suspicious call to the "collab.GetIcon" function (e.g. CVE-2009-0927).',
+            'collab.GetIcon': 'JavaScript makes a suspicious call to the "collab.GetIcon" function (e.g. CVE-2009-0927).',
+            'doc["printSeps"]': 'JavaScript makes a suspicious call to the "doc.printSeps" function (e.g. CVE-2010-4091).',
+            'doc.printSeps': 'JavaScript makes a suspicious call to the "doc.printSeps" function (e.g. CVE-2010-4091).',
+            'media["newPlayer"]': 'JavaScript makes a suspicious call to the "media.newplayer" function (e.g. CVE-2009-4324).',
+            'media.newPlayer': 'JavaScript makes a suspicious call to the "media.newplayer" function (e.g. CVE-2009-4324).',
+            'CoolType.SING.uniqueName': 'JavaScript uses a known vulnerable compact font format object (e.g. CVE-2010-2883)',
+            '.rawValue': 'JavaScript makes a suspicious call to the "rawValue" method (e.g. CVE-2010-0188).',
+            'app.addToolButton': 'JavaScript contains ToolButton (e.g. CVE-2014-0496, CVE-2013-3346).',
+            'app.removeToolButton': 'JavaScript contains ToolButton (e.g. CVE-2014-0496, CVE-2013-3346).',
+            'spell.customDictionaryOpen': 'JavaScript makes a suspicious call to the "spell.customDictionaryOpen" function (e.g. CVE-2009-1493).',
+            '.keep.previous = "contentArea"': 'JavaScript modifies ".keep.previous" property (e.g. CVE-2013-0640).',
+            'collab.collectEmailInfo': 'JavaScript makes a suspicious call to the "collab.collectEmailInfo" function (e.g. CVE-2007-5659).',
+            'getAnnots': 'JavaScript makes suspicious use of the "getAnnots" method (e.g. CVE-2009-1492).',
+            '<choiceList>': 'JavaScript contains choiceList object (e.g. CVE-2013-0640).',
+        }
+
+        found = False
+        # Minimize and beautify JS to improve detection
+        data += self.js_beautify(self.js_minimize(data.lower()))
+        for key, value in detection_strings.items():
+            if key.lower() in data:
+                found = True
+                self.add_detection(value, obj_id)
+        return found
+
     def add_objects(self, obj_id, reason, data):
         """
         Manage the insertion of child objects
@@ -256,10 +350,10 @@ def add_objects(self, obj_id, reason, data):
             - Fields: title, header, search window size
         """
         file_sigs = [('Flash', 'CWS', 50),
-                    ('Flash', 'FWS', 50)]
+                     ('Flash', 'FWS', 50)]
         file_sigs_found = False
 
-        #Filter/extract embedded files that are being submitted
+        # Filter/extract embedded files that are being submitted
         if reason == 'EmbeddedFile':
             for sig in file_sigs:
                 search_header = sig[1]
@@ -270,10 +364,14 @@ def add_objects(self, obj_id, reason, data):
                     reason = '{} ({})'.format(reason, sig[0])
                     data = data[offset:]
                     break
-            if file_sigs_found == False:
+            if file_sigs_found is False:
+                return
+        elif reason == 'JavaScript':
+            # Filter/add interesting JavaScript
+            if not self.js_detection(obj_id, data):
                 return
 
-        #Add object to addded_files list
+        # Add object to added_files list
         md5_digest = hashlib.md5(data).hexdigest()
         self.added_files.append([md5_digest,
                                 obj_id,
@@ -284,45 +382,45 @@ def add_objects(self, obj_id, reason, data):
     def run_pdfparser(self, data):
         """
         Uses pdf-parser to get information for each object.
-        """        
+        """
         oPDFParser = pdfparser.cPDFParser(data)
         done = False
         found_objects = {}
 
-        #Walk the PDF and inspect PDF objects
+        # Walk the PDF and inspect PDF objects
         found_objects = self.object_search(data)
 
-        while done == False:
+        while done is False:
             try:
                 pdf_object = oPDFParser.GetObject()
             except Exception as e:
                 pdf_object = None
 
-            if pdf_object != None:
+            if pdf_object is not None:
                 if pdf_object.type in [pdfparser.PDF_ELEMENT_INDIRECT_OBJECT]:
-                    #Get general information for this PDF object
+                    # Get general information for this PDF object
                     rawContent = pdfparser.FormatOutput(pdf_object.content, True)
                     section_md5_digest = hashlib.md5(rawContent).hexdigest()
                     section_entropy = self.H(rawContent)
                     object_type = pdf_object.GetType()
 
-                    #Access data associated with this PDF object
+                    # Access data associated with this PDF object
                     if pdf_object.ContainsStream():
                         object_stream = True
                         try:
-                            #decompress stream using codec
-                            streamContent = pdf_object.Stream() 
+                            # decompress stream using codec
+                            streamContent = pdf_object.Stream()
                         except Exception as e:
-                            streamContent = "decompress failed."
+                            streamContent = 'decompress failed.'
 
-                        if "decompress failed." in streamContent[:50]:
-                            #Provide raw stream data
+                        if 'decompress failed.' in streamContent[:50]:
+                            # Provide raw stream data
                             streamContent = pdf_object.Stream('')
 
-                        #Stream returns list of object tags (not actual stream data)
+                        # Stream returns list of object tags (not actual stream data)
                         if type(streamContent) == list:
                             streamContent = pdfparser.FormatOutput(pdf_object.content, True)
-                            #Inspect pdf_object.content and extract raw stream
+                            # Inspect pdf_object.content and extract raw stream
                             stream_start = streamContent.find('stream') + len('stream')
                             stream_end = streamContent.rfind('endstream')
                             if stream_start >= 0 and stream_end > 0:
@@ -333,54 +431,78 @@ def run_pdfparser(self, data):
                         object_stream = False
                         stream_md5_digest = ''
 
-                    #Collect references between this object and others
+                    # Collect references between this object and others
                     object_references = []
                     for reference in pdf_object.GetReferences():
                         object_references.append(reference[0])
                     object_references = ','.join(object_references)
 
-                    #Get results from the object searching
+                    # Get results from the object searching
                     object_content = []
                     if found_objects.get('js'):
                         if str(pdf_object.id) in found_objects.get('js'):
                             object_content.append('JavaScript')
-                            #Submit JavaScript objects to CRITS
+                            # Pass JavaScript to add_objects for further analysis
                             if object_stream:
                                 self.add_objects('{} (stream)'.format(pdf_object.id),
-                                                   'JavaScript',
-                                                   streamContent)
+                                                 'JavaScript',
+                                                 streamContent)
                             else:
                                 self.add_objects('{}'.format(pdf_object.id),
-                                                   'JavaScript',
-                                                   rawContent)
+                                                 'JavaScript',
+                                                 rawContent)
                     if found_objects.get('file'):
                         if str(pdf_object.id) in found_objects.get('file'):
                             object_content.append('EmbeddedFile')
-                            #Submit (some) embedded files to CRITS
+                            # Pass embedded files to add_objects for further analysis
                             if object_stream:
                                 self.add_objects('{} (stream)'.format(pdf_object.id),
-                                                   'EmbeddedFile',
-                                                   streamContent)
+                                                 'EmbeddedFile',
+                                                 streamContent)
                             else:
                                 self.add_objects('{}'.format(pdf_object.id),
-                                                   'EmbeddedFile',
-                                                   rawContent)
+                                                 'EmbeddedFile',
+                                                 rawContent)
 
                     result = {
-                            "obj_id":           pdf_object.id,
-                            "obj_version":      pdf_object.version,
-                            "size":             len(rawContent),
-                            "type":             object_type,
-                            "entropy":          section_entropy,
-                            "content":          ','.join(object_content),
-                            "x_refs":           object_references,
-                            "stream":           object_stream,
-                            "stream_md5":       stream_md5_digest,
+                            'obj_id':           pdf_object.id,
+                            'obj_version':      pdf_object.version,
+                            'size':             len(rawContent),
+                            'type':             object_type,
+                            'entropy':          section_entropy,
+                            'content':          ','.join(object_content),
+                            'x_refs':           object_references,
+                            'stream':           object_stream,
+                            'stream_md5':       stream_md5_digest,
                     }
                     self._add_result('pdf_parser', section_md5_digest, result)
             else:
                 done = True
 
+    def run_pdfheaders(self, data):
+        """
+        Search for multiple PDF headers
+        """
+        header = '%PDF'
+        footers = ['%EOF\x0d', '%EOF\x0a']
+        found = False
+
+        if data.count(header) > 1:
+            for footer in footers:
+                # Reverse searching for embedded PDFs
+                end = data.rfind(footer)
+                while end >= 0:
+                    start = data.rfind(header, 0, end)
+                    end += len(footer)
+                    if start < 1:
+                        # Stop: no earlier header (-1), or only the sample's own header at offset 0.
+                        break
+                    self.add_detection('PDF contains embedded PDF document at offset {} to {}.'.format(start, end))
+                    found = True
+                    end = data.rfind(footer, 0, start)
+        if found:
+            self._info('PDFInfo analysis may contain duplicate object ids due to the presence of an embedded PDF.')
+
     def run(self, obj, config):
         """
         Run PDF service
@@ -398,23 +520,19 @@ def run(self, obj, config):
         self._notify()
         self.run_pdfparser(data)
         self._notify()
+        self.run_pdfheaders(data)
+
+        # Add detection items
+        for key, value in sorted(self.detection.items(), reverse=True):
+            self._add_result('pdf_detection', key, {'obj_id(s)': ', '.join(value)})
 
-        #Add child objects
+        # Add child objects
         if config['pdf_objects']:
             for f in self.added_files:
-                self._info('{} {} {} {}'.format(f[0], f[1], f[2], f[3]))
-                """
                 handle_file(f[0], f[4], obj.source,
                             related_id=str(obj.id),
                             campaign=obj.campaign,
                             method=self.name,
                             relationship='Extracted_From',
                             user=self.current_task.username)
-                self._add_result("pdf_objects_added", f[0], {'obj_id': f[1], 'size': f[1], 'reason': f[3]})
-                """
-        #Add detection items
-        for key, value in self.detection.items():
-            self._add_result("pdf_detection", key, {'description': value[0], 'obj_id(s)': ', '.join(value[1])})
-
-    def _parse_error(self, item, e):
-        self._error("Error parsing %s (%s): %s" % (item, e.__class__.__name__, e))
+                self._add_result('pdf_objects_added', f[0], {'obj_id': f[1], 'size': f[1], 'reason': f[3]})

From ff7246958da27014ffdb3c26aefb72c2eb3c0716 Mon Sep 17 00:00:00 2001
From: Wootski <wootski@users.noreply.github.com>
Date: Fri, 3 Apr 2015 12:19:35 -0400
Subject: [PATCH 9/9] Additional header for compressed Flash files.

---
 pdfinfo_service/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pdfinfo_service/__init__.py b/pdfinfo_service/__init__.py
index bcb9c0d2..03b97a98 100644
--- a/pdfinfo_service/__init__.py
+++ b/pdfinfo_service/__init__.py
@@ -350,7 +350,8 @@ def add_objects(self, obj_id, reason, data):
             - Fields: title, header, search window size
         """
         file_sigs = [('Flash', 'CWS', 50),
-                     ('Flash', 'FWS', 50)]
+                     ('Flash', 'FWS', 50),
+                     ('Flash', 'ZWS', 50)]
         file_sigs_found = False
 
         # Filter/extract embedded files that are being submitted