#!/usr/bin/env python3
"""
jpgpdfocr - Convert JPG images to a searchable PDF using OCR.

This script processes a directory of JPG images, runs OCR (Optical Character
Recognition) on each image using Tesseract, and merges them into a single
searchable PDF.

Pages are ordered by sorting the file names as plain strings, so zero-pad any
page numbers (``page_02.jpg`` rather than ``page_2.jpg``) to keep them in
order.

Usage:
    ./jpgpdfocr --input <directory> [--output <file>] [--lang <language>] [--threads <num>] [--quiet]

Arguments:
    --input, -i    Directory containing JPG files (required).
    --output, -o   Output PDF filename
                   (default: <input_folder>_searchable.pdf, written inside
                   the input directory).
    --lang, -l     OCR language (default: 'eng').
    --threads, -t  Number of threads for OCR (default: auto-detect CPU cores).
    --quiet, -q    Suppress informational output (errors are still reported
                   on stderr).

Dependencies:
    - Python 3
    - PIL (Pillow)
    - pytesseract (Tesseract OCR)
    - PyPDF2
    - concurrent.futures (built-in)
"""

import argparse
import concurrent.futures
import io
import os
import sys
import time

import pytesseract
from PIL import Image
from PyPDF2 import PdfMerger


def process_image_ocr(args):
    """Run OCR on a single image and return the result as PDF content.

    Args:
        args: Tuple ``(img_path, lang, page_num, total_pages)``, optionally
            followed by a fifth ``verbose`` flag (defaults to ``True`` when
            omitted). A single tuple is used so the function can be handed
            directly to ``Executor.map``.

    Returns:
        The single-page PDF content returned by pytesseract, or ``None`` if
        the image could not be opened or OCR failed.
    """
    img_path, lang, page_num, total_pages, *extra = args
    verbose = extra[0] if extra else True

    if verbose:
        # "\r" rewrites the same terminal line instead of scrolling.
        sys.stdout.write(f"\rProcessing page {page_num}/{total_pages}...")
        sys.stdout.flush()

    try:
        with Image.open(img_path) as img:
            return pytesseract.image_to_pdf_or_hocr(img, extension='pdf', lang=lang)
    except Exception as e:
        # Deliberately broad: one unreadable page must not abort the whole
        # batch. The caller counts the None results and reports them.
        print(f"\nError processing page {page_num} ({img_path}): {e}", file=sys.stderr)
        return None


def create_searchable_pdf(input_dir, output_pdf, language='eng', threads=None, verbose=True):
    """Create a searchable PDF from the JPG images in a directory using OCR.

    Args:
        input_dir: Directory scanned (non-recursively) for ``.jpg``/``.jpeg``
            files; the extension match is case-insensitive.
        output_pdf: Path of the PDF file to write.
        language: OCR language passed through to pytesseract.
        threads: Maximum number of worker threads. ``None`` means use the
            detected CPU count.
        verbose: When false, progress and summary messages are suppressed;
            errors are still written to stderr.

    Returns:
        ``True`` if a PDF containing at least one page was written, ``False``
        if the input directory is missing, contains no images, or OCR failed
        for every page.

    Raises:
        ValueError: If ``threads`` is given and is less than 1 (raised by
            ``ThreadPoolExecutor``).
    """
    start_time = time.perf_counter()

    if not os.path.isdir(input_dir):
        print(f"Input directory not found: '{input_dir}'", file=sys.stderr)
        return False

    if verbose:
        print(f"Processing images in '{input_dir}' with OCR...")

    # Plain string sort: page order follows the file names lexicographically.
    image_files = sorted(
        os.path.join(input_dir, name)
        for name in os.listdir(input_dir)
        if name.lower().endswith(('.jpg', '.jpeg'))
    )
    total_pages = len(image_files)

    if not image_files:
        print("No images found!", file=sys.stderr)
        return False

    if verbose:
        print(f"Found {total_pages} images.")

    if threads is None:
        # os.cpu_count() may return None when the count cannot be determined.
        threads = os.cpu_count() or 1

    tasks = [
        (img_path, language, page_num, total_pages, verbose)
        for page_num, img_path in enumerate(image_files, start=1)
    ]

    # No point starting more workers than there are pages. executor.map
    # yields results in task order, so page order is preserved.
    max_workers = min(threads, total_pages)
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        ocr_pdfs = list(executor.map(process_image_ocr, tasks))

    pages = [pdf for pdf in ocr_pdfs if pdf]
    failed = total_pages - len(pages)

    if not pages:
        # Avoid leaving an empty PDF behind and reporting success.
        print("\nOCR failed for every page; no PDF was written.", file=sys.stderr)
        return False
    if failed:
        print(
            f"\nWarning: {failed} of {total_pages} pages failed OCR and were skipped.",
            file=sys.stderr,
        )

    # Merge PDFs in memory
    if verbose:
        print("\nMerging OCRed pages...")

    merger = PdfMerger()
    try:
        for pdf in pages:
            merger.append(io.BytesIO(pdf))
        with open(output_pdf, "wb") as f:
            merger.write(f)
    finally:
        # Release the merger's resources even if appending/writing fails.
        merger.close()

    elapsed_time = time.perf_counter() - start_time
    if verbose:
        print(f"OCR completed in {elapsed_time:.2f} seconds.")
        print(f"Searchable PDF created: {output_pdf}")

    return True


def main():
    """Parse command-line arguments and build the searchable PDF.

    Exits with status 1 if no PDF could be created, and with argparse's
    usage error status if ``--threads`` is not a positive integer.
    """
    parser = argparse.ArgumentParser(description='Create a searchable PDF from JPG files with OCR')
    parser.add_argument('--input', '-i', required=True, help='Directory containing JPG files')
    parser.add_argument('--output', '-o', help='Output PDF filename (default: based on folder name)')
    parser.add_argument('--lang', '-l', default='eng', help='OCR language (default: eng)')
    parser.add_argument('--threads', '-t', type=int, help='Number of threads to use for OCR')
    parser.add_argument('--quiet', '-q', action='store_true', help='Minimize output')

    args = parser.parse_args()

    if args.threads is not None and args.threads < 1:
        parser.error('--threads must be a positive integer')

    input_dir = os.path.normpath(args.input)

    # Default output filename based on folder name
    output_pdf = args.output
    if not output_pdf:
        # abspath so that "--input ." uses the real folder name rather than
        # "."; the fallback covers a filesystem root, whose basename is empty.
        folder_name = os.path.basename(os.path.abspath(input_dir)) or "output"
        output_pdf = os.path.join(input_dir, f"{folder_name}_searchable.pdf")

    # Create OCR PDF
    success = create_searchable_pdf(
        input_dir, output_pdf, args.lang, args.threads, verbose=not args.quiet
    )

    if not success:
        sys.exit(1)


if __name__ == "__main__":
    main()