#!/usr/bin/env python3
"""
Create searchable (OCRed) PDFs from existing PDFs or from scanned images.

The script inspects <input_path> and picks one of three modes:

1. A single PDF file
   -> flatten and re-OCR into a single searchable PDF.
2. A folder of numbered image files (JPG/JPEG)
   -> combine (in natural numeric order) and OCR into one searchable PDF.
3. A folder of PDF files
   -> batch process each PDF, flattening and re-OCRing each into its own
      searchable PDF.

Usage:
    ./multi_mode_ocr.py <input_path> [--output <file_or_folder>] [--replace]
                        [--lang <language>] [--threads <num>] [--quiet]

Arguments:
    <input_path>   Path to either (a) a single PDF file, (b) a folder of
                   images, or (c) a folder of PDF files. (Required)
    --output, -o   Desired output path. Interpreted differently depending on
                   the input:
                     - Single PDF or folder of images: one output PDF file.
                     - Folder of PDFs: a folder that receives the new PDFs.
                   Defaults when --output is omitted:
                     - Single PDF: the original file is overwritten in place.
                     - Folder of images: "<folder>/<folder>_searchable.pdf".
                     - Folder of PDFs: "<folder>/OCRed_PDFs/<name>_searchable.pdf"
                       for every PDF (unless --replace is given).
    --replace, -r  Overwrite the original PDF(s) instead of creating new
                   files. Only meaningful for a folder of PDFs; a single PDF
                   is already replaced by default unless --output is given.
    --lang, -l     OCR language (default: "eng").
    --threads, -t  Number of threads to use for OCR
                   (default: auto-detect CPU cores).
    --quiet, -q    Suppress progress messages (only errors are printed).

Dependencies:
    - Python 3
    - PIL (Pillow)
    - pytesseract (Tesseract OCR)
    - PyPDF2
    - pdf2image (for flattening PDFs)
    - concurrent.futures (built-in)
"""

import argparse
import concurrent.futures
import io
import multiprocessing
import os
import re
import sys
import time

# Third-party packages are imported defensively so that a missing package
# produces one clear error message (and "--help" keeps working) instead of a
# traceback at import time.
try:
    from PIL import Image
except ImportError:
    Image = None

try:
    import pytesseract
except ImportError:
    pytesseract = None

try:
    from PyPDF2 import PdfMerger
except ImportError:
    PdfMerger = None

try:
    from pdf2image import convert_from_path
except ImportError:
    convert_from_path = None


# File extensions (lower-case) recognised for each kind of input.
IMAGE_EXTENSIONS = ('.jpg', '.jpeg')
PDF_EXTENSIONS = ('.pdf',)


def _check_dependencies():
    """Return True if Pillow, pytesseract and PyPDF2 imported successfully.

    Prints a single error line naming the missing packages otherwise.
    """
    required = (
        ("Pillow", Image),
        ("pytesseract", pytesseract),
        ("PyPDF2", PdfMerger),
    )
    missing = [name for name, module in required if module is None]
    if missing:
        print(f"ERROR: Missing required Python package(s): {', '.join(missing)}")
        return False
    return True


def _natural_sort_key(path):
    """Return a sort key that orders 'page2' before 'page10'.

    The file name is split into alternating text / digit runs; digit runs are
    compared as integers. The path itself is the tie-breaker so the order is
    fully deterministic.
    """
    name = os.path.basename(path).lower()
    # With a capturing group, re.split yields text at even indices and the
    # captured digit runs at odd indices, so element types always line up.
    parts = [
        int(token) if index % 2 else token
        for index, token in enumerate(re.split(r'(\d+)', name))
    ]
    return (parts, path)


def _list_files_with_extensions(folder, extensions):
    """Return naturally sorted paths of regular files in *folder*.

    Only names ending (case-insensitively) in one of *extensions* are kept;
    sub-directories that merely look like files (e.g. "scans.pdf/") are
    ignored. Raises OSError if *folder* cannot be listed.
    """
    candidates = (
        os.path.join(folder, name)
        for name in os.listdir(folder)
        if name.lower().endswith(extensions)
    )
    return sorted(
        (path for path in candidates if os.path.isfile(path)),
        key=_natural_sort_key,
    )


def _is_same_file(path_a, path_b):
    """Return True if both paths refer to the same file."""
    try:
        return os.path.samefile(path_a, path_b)
    except OSError:
        # samefile() needs both paths to exist; fall back to a textual
        # comparison when the output has not been created yet.
        return os.path.abspath(path_a) == os.path.abspath(path_b)


def _positive_int(text):
    """argparse type: parse *text* as an integer >= 1."""
    value = int(text)
    if value < 1:
        raise argparse.ArgumentTypeError("must be a positive integer")
    return value


def process_image_ocr(args):
    """Process a single image with OCR and return the OCR PDF bytes.

    Args:
        args: Tuple ``(img, lang, page_num, total_pages, verbose)``. It is a
            single packed argument so the function can be used directly with
            ``executor.map``.

    Returns:
        The one-page searchable PDF as bytes, or None if OCR failed (the
        error is printed).
    """
    img, lang, page_num, total_pages, verbose = args
    try:
        if verbose:
            sys.stdout.write(f"\rProcessing page {page_num}/{total_pages}...")
            sys.stdout.flush()
        return pytesseract.image_to_pdf_or_hocr(img, extension='pdf', lang=lang)
    except Exception as e:
        # Worker boundary: one bad page must not abort the whole pool, so the
        # failure is reported and signalled to the caller with None.
        print(f"\nError processing page {page_num}: {e}")
        return None


def create_searchable_pdf_from_images(image_list, output_pdf, language='eng',
                                      threads=None, verbose=True,
                                      allow_partial=True):
    """
    Create a searchable PDF from a list of PIL Images using OCR and write it
    to output_pdf.

    Args:
        image_list: PIL Images, one per page, in output order.
        output_pdf: Path of the PDF file to write.
        language: Tesseract language code.
        threads: Number of OCR worker threads; None or a value below 1 means
            "use the number of CPU cores".
        verbose: Print progress messages when True. Errors are always printed.
        allow_partial: When True, pages whose OCR failed are left out of the
            output (with a warning). When False, any failed page aborts the
            run before anything is written.

    Returns:
        True if the PDF was written, False otherwise.
    """
    start_time = time.time()
    total_pages = len(image_list)
    if total_pages == 0:
        print("No images found.")
        return False

    if not _check_dependencies():
        return False

    if threads is None or threads < 1:
        threads = multiprocessing.cpu_count()
    # No point in starting more workers than there are pages.
    workers = min(threads, total_pages)

    tasks = [
        (img, language, page_num, total_pages, verbose)
        for page_num, img in enumerate(image_list, start=1)
    ]
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
        ocr_pdfs = list(executor.map(process_image_ocr, tasks))

    failed_pages = [
        page_num for page_num, data in enumerate(ocr_pdfs, start=1) if not data
    ]
    if failed_pages:
        pages = ", ".join(str(page_num) for page_num in failed_pages)
        if len(failed_pages) == total_pages:
            print("\nERROR: OCR failed for every page; no output written.")
            return False
        if not allow_partial:
            print(f"\nERROR: OCR failed for page(s) {pages}; no output written.")
            return False
        print(f"\nWARNING: OCR failed for page(s) {pages}; "
              "they are missing from the output.")

    if verbose:
        print("\nMerging OCRed pages...")

    merger = PdfMerger()
    try:
        for pdf_page in ocr_pdfs:
            if pdf_page:
                merger.append(io.BytesIO(pdf_page))
        # Merge in memory first: the destination (possibly the original PDF)
        # is only opened for writing once the merged document exists.
        merged = io.BytesIO()
        merger.write(merged)
        with open(output_pdf, "wb") as f:
            f.write(merged.getvalue())
    except Exception as e:
        print(f"Failed to write output PDF: {e}")
        return False
    finally:
        merger.close()

    elapsed_time = time.time() - start_time
    if verbose:
        print(f"OCR completed in {elapsed_time:.2f} seconds.")
        print(f"Searchable PDF created: {output_pdf}")
    return True


def create_searchable_pdf_from_directory_of_images(
    input_dir, output_pdf, language='eng', threads=None, verbose=True
):
    """
    Collect all JPG/JPEG images in input_dir, sort them in natural order
    (so "2.jpg" comes before "10.jpg"), and create a single searchable PDF.

    Images that cannot be opened are skipped with a message.

    Returns:
        True if the PDF was written, False otherwise.
    """
    if verbose:
        print(f"Processing images in folder '{input_dir}'...")

    if not _check_dependencies():
        return False

    image_files = _list_files_with_extensions(input_dir, IMAGE_EXTENSIONS)

    pil_images = []
    try:
        for img_path in image_files:
            try:
                pil_images.append(Image.open(img_path))
            except Exception as e:
                # Best effort: one unreadable file should not stop the run.
                print(f"Skipping {img_path} due to error: {e}")

        if verbose:
            print(f"Found {len(pil_images)} images to process.")

        return create_searchable_pdf_from_images(
            pil_images, output_pdf, language, threads, verbose
        )
    finally:
        # Image.open keeps the underlying file handle open; release them all.
        for img in pil_images:
            img.close()


def flatten_pdf_to_images(input_pdf, dpi=300):
    """
    Convert each page of a PDF to a list of PIL Images using pdf2image.

    Returns:
        A list of PIL Images, one per page.

    Raises:
        RuntimeError: If pdf2image is not installed.
    """
    if convert_from_path is None:
        raise RuntimeError("pdf2image is not installed. Cannot flatten PDFs.")
    return convert_from_path(input_pdf, dpi=dpi)


def flatten_and_ocr_pdf(input_pdf, output_pdf, language='eng', threads=None,
                        verbose=True, dpi=300):
    """
    Flatten an existing PDF to images, then re-OCR into a new searchable PDF.

    When output_pdf is the same file as input_pdf (in-place replacement), the
    original is only overwritten if every page was OCRed successfully.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path of the searchable PDF to write.
        language: Tesseract language code.
        threads: Number of OCR worker threads (None = CPU count).
        verbose: Print progress messages when True.
        dpi: Resolution used to rasterise the pages.

    Returns:
        True on success, False otherwise.
    """
    if verbose:
        print(f"Flattening PDF '{input_pdf}' at {dpi} dpi...")

    try:
        pil_images = flatten_pdf_to_images(input_pdf, dpi=dpi)
    except Exception as e:
        print(f"Failed to convert PDF to images: {e}")
        return False

    if verbose:
        print(f"PDF has {len(pil_images)} pages. Starting OCR...")

    return create_searchable_pdf_from_images(
        pil_images,
        output_pdf,
        language=language,
        threads=threads,
        verbose=verbose,
        # Never replace the source document with one that lost pages.
        allow_partial=not _is_same_file(input_pdf, output_pdf),
    )


def batch_flatten_and_ocr_pdfs(pdf_files, output_folder, language='eng',
                               threads=None, replace=False, verbose=True):
    """
    Batch-process a list of PDFs: flatten and re-OCR each.

    - If replace=True, overwrites each original PDF (output_folder is unused).
    - Otherwise, writes "<name>_searchable.pdf" into output_folder.

    Returns:
        True only if every PDF was processed successfully.

    Raises:
        ValueError: If replace is False and no output_folder was given.
    """
    if not pdf_files:
        print("No PDF files found in the folder.")
        return False

    if not replace and output_folder is None:
        raise ValueError("output_folder is required when replace is False")

    if verbose:
        print(f"Found {len(pdf_files)} PDFs to process.")

    success = True
    for pdf_path in pdf_files:
        if replace:
            out_path = pdf_path
        else:
            base_name = os.path.splitext(os.path.basename(pdf_path))[0]
            out_path = os.path.join(output_folder, f"{base_name}_searchable.pdf")

        # Keep going after a failure so one bad PDF does not block the rest.
        if not flatten_and_ocr_pdf(pdf_path, out_path, language, threads, verbose):
            success = False

    return success


def determine_input_mode(input_path, verbose=True):
    """
    Determine which of the three modes we're in:
      1) Single PDF file
      2) Folder of images
      3) Folder of PDFs

    Returns:
        A tuple (mode, items) where mode is "single_pdf" (items is the PDF
        path), "folder_images" or "folder_pdfs" (items is a naturally sorted
        list of file paths), or (None, None) if the input cannot be handled.
        A folder containing exactly one PDF is reported as "single_pdf".

    The reason for a (None, None) result is always printed; the verbose
    argument is accepted for interface compatibility.
    """
    if os.path.isfile(input_path):
        if input_path.lower().endswith(PDF_EXTENSIONS):
            return ('single_pdf', input_path)
        # A lone image is not supported; only folders of images are.
        print("ERROR: Single file is not a PDF. Exiting.")
        return (None, None)

    if not os.path.isdir(input_path):
        print("ERROR: Input path is neither a file nor a folder.")
        return (None, None)

    try:
        pdf_files = _list_files_with_extensions(input_path, PDF_EXTENSIONS)
        image_files = _list_files_with_extensions(input_path, IMAGE_EXTENSIONS)
    except OSError as e:
        print(f"ERROR: Could not read folder '{input_path}': {e}")
        return (None, None)

    if pdf_files and image_files:
        print("ERROR: The folder contains both images and PDFs. "
              "Please separate them or specify a single PDF file.")
        return (None, None)

    if image_files:
        return ('folder_images', image_files)

    if len(pdf_files) == 1:
        # Edge case: exactly one PDF in the folder. Treat as single_pdf.
        return ('single_pdf', pdf_files[0])

    if pdf_files:
        return ('folder_pdfs', pdf_files)

    print("ERROR: The folder is empty or doesn't contain PDFs or JPGs.")
    return (None, None)


def main():
    """Parse the command line, detect the input mode and run the OCR job.

    Exits with status 1 if the input cannot be handled or processing fails.
    """
    parser = argparse.ArgumentParser(
        description="Create a searchable PDF from either: "
                    "a PDF file (flatten, re-OCR), "
                    "a folder of numbered image files, "
                    "or a folder of PDF files (batch)."
    )
    parser.add_argument(
        'input_path',
        help='Path to file/folder input (PDF file, folder of images, or folder of PDFs).')
    parser.add_argument(
        '--output', '-o',
        help='Output path. Interpretation depends on input: '
             'single file/folder-of-images => single PDF file, '
             'folder-of-pdfs => output folder for new PDFs. '
             'Default: a single PDF is overwritten in place; otherwise '
             '"_searchable" is appended to new PDFs if not using --replace.')
    parser.add_argument(
        '--replace', '-r', action='store_true',
        help='Overwrite the original PDF(s). '
             'Only valid if input is PDF(s).')
    parser.add_argument(
        '--lang', '-l', default='eng',
        help='OCR language (default: eng)')
    parser.add_argument(
        '--threads', '-t', type=_positive_int,
        help='Number of OCR threads (default: # of CPU cores).')
    parser.add_argument(
        '--quiet', '-q', action='store_true',
        help='Minimize output messages.')

    args = parser.parse_args()
    verbose = not args.quiet

    input_path = os.path.normpath(args.input_path)
    mode, items = determine_input_mode(input_path, verbose=verbose)
    if mode is None:
        sys.exit(1)  # an error has already been printed

    # Fail fast, before any expensive PDF rasterisation.
    if not _check_dependencies():
        sys.exit(1)

    if mode == 'single_pdf':
        # items is the path to that single PDF. Without --output the
        # original file is replaced in place.
        pdf_path = items
        output_pdf = args.output if args.output else pdf_path
        success = flatten_and_ocr_pdf(
            pdf_path,
            output_pdf,
            language=args.lang,
            threads=args.threads,
            verbose=verbose
        )

    elif mode == 'folder_images':
        # There's no concept of replace for images.
        if args.replace and verbose:
            print("Warning: --replace has no effect for folder-of-images input.")

        if args.output:
            output_pdf = args.output
        else:
            # abspath so that inputs like "." still yield the real folder
            # name. NOTE: the default output lands inside the input folder,
            # so a second run on the same folder will see images and a PDF.
            folder_name = os.path.basename(os.path.abspath(input_path))
            output_pdf = os.path.join(input_path, f"{folder_name}_searchable.pdf")

        success = create_searchable_pdf_from_directory_of_images(
            input_path,
            output_pdf,
            language=args.lang,
            threads=args.threads,
            verbose=verbose
        )

    elif mode == 'folder_pdfs':
        # items is the list of PDF files (two or more; a single PDF in a
        # folder is reported as 'single_pdf').
        pdf_files = items

        if args.replace:
            if args.output and verbose:
                print("Warning: --output is ignored when --replace is used.")
            success = batch_flatten_and_ocr_pdfs(
                pdf_files,
                output_folder=None,
                language=args.lang,
                threads=args.threads,
                replace=True,
                verbose=verbose
            )
        else:
            if args.output:
                output_folder = os.path.normpath(args.output)
            else:
                # By default, use a subfolder of the input folder.
                output_folder = os.path.join(input_path, "OCRed_PDFs")
                if verbose:
                    print(f"No output folder specified; using '{output_folder}'.")

            try:
                os.makedirs(output_folder, exist_ok=True)
            except OSError as e:
                print(f"ERROR: Could not create output folder: {e}")
                sys.exit(1)

            success = batch_flatten_and_ocr_pdfs(
                pdf_files,
                output_folder=output_folder,
                language=args.lang,
                threads=args.threads,
                replace=False,
                verbose=verbose
            )

    else:
        # Shouldn't get here
        success = False

    if not success:
        sys.exit(1)


if __name__ == "__main__":
    main()