Convert PDFs to PNGs before scanning

sublime-security · Dec 20, 2024 · fd7e960 · fd7e960
1 parent 3af1871
commit fd7e960
Show file tree

Hide file tree

Showing 5 changed files with 26 additions and 7 deletions.
diff --git a/build/configs/scanners.yaml b/build/configs/scanners.yaml
@@ -408,13 +408,16 @@ scanners:
           - 'jpeg_file'
           - 'image/png'
           - 'png_file'
-          - 'image/tiff'
+          - 'image/tif'
           - 'type_is_tiff'
           - 'image/x-ms-bmp'
           - 'image/bmp'
           - 'bmp_file'
           - 'image/webp'
+          - 'pdf_file'
       priority: 5
+      options:
+        pdf_to_png: True
   'ScanRar':
     - positive:
         flavors:

diff --git a/configs/python/backend/backend.yaml b/configs/python/backend/backend.yaml
@@ -416,7 +416,10 @@ scanners:
           - 'type_is_tiff'
           - 'image/x-ms-bmp'
           - 'bmp_file'
+          - 'pdf_file'
       priority: 5
+      options:
+        pdf_to_png: True
   'ScanRar':
     - positive:
         flavors:

diff --git a/src/python/strelka/scanners/scan_ocr.py b/src/python/strelka/scanners/scan_ocr.py
@@ -1,8 +1,8 @@
-import fitz
 import os
 import subprocess
 import tempfile
 
+import fitz
 from strelka import strelka
 
 
@@ -16,6 +16,7 @@ class ScanOcr(strelka.Scanner):
         tmp_directory: Location where tempfile writes temporary files.
             Defaults to '/tmp/'.
     """
+
     def scan(self, data, file, options, expire_at):
         extract_text = options.get('extract_text', False)
         tmp_directory = options.get('tmp_directory', '/tmp/')
@@ -34,7 +35,7 @@ def scan(self, data, file, options, expire_at):
                 tess_return = subprocess.call(
                     ['tesseract', tmp_data.name, tmp_tess.name],
                     stdout=subprocess.DEVNULL,
-                    stderr=subprocess.DEVNULL
+                    stderr=subprocess.DEVNULL,
                 )
                 tess_txt_name = f'{tmp_tess.name}.txt'
                 if tess_return == 0:

diff --git a/src/python/strelka/scanners/scan_pdf.py b/src/python/strelka/scanners/scan_pdf.py
@@ -101,8 +101,10 @@ def scan(self, data, file, options, expire_at):
                             for link in links:
                                 if "uri" in link:
                                     self.event["annotated_uris"].append(link["uri"])
-                        if extract_text:
+                        if extract_text and hasattr(page, "getText"):
                             extracted_text += page.getText()
+                        if extract_text and hasattr(page, "get_text"):
+                            extracted_text += page.get_text()
 
                     # PDF Text Extraction
                     # Caution: Will increase time and object storage size

diff --git a/src/python/strelka/scanners/scan_qr.py b/src/python/strelka/scanners/scan_qr.py
@@ -1,8 +1,9 @@
-from pyzbar.pyzbar import decode
-from PIL import Image
 import io
 import re
 
+import fitz
+from PIL import Image
+from pyzbar.pyzbar import decode, ZBarSymbol
 from strelka import strelka
 
 # Regex to match URL
@@ -14,9 +15,18 @@ class ScanQr(strelka.Scanner):
     """
     Collects QR code metadata from image files.
     """
+
     def scan(self, data, file, options, expire_at):
+        pdf_to_png = options.get('pdf_to_png', False)
+
         try:
-            barcodes = decode(Image.open(io.BytesIO(data)))
+            if pdf_to_png and 'application/pdf' in file.flavors.get('mime', []):
+                # TODO: Use fitz builtin OCR support which also wraps tesseract
+                doc = fitz.open(stream=data, filetype='pdf')
+                data = doc.get_page_pixmap(0, dpi=150).tobytes()
+
+            img = Image.open(io.BytesIO(data))
+            barcodes = decode(img, symbols=[ZBarSymbol.QRCODE])
 
             try:
                 self.event['data'] = barcodes[0].data.decode('utf-8')