# Path: Scripts/jpgpdfocr

#!/usr/bin/env python3
"""
jpgpdfocr - Convert JPG images to a searchable PDF using OCR.

This script processes a directory of JPG images, runs OCR (Optical Character Recognition)
on each image using Tesseract, and merges them into a single searchable PDF.

Usage:
    ./jpgpdfocr --input <directory> [--output <file>] [--lang <language>]
                [--threads <num>] [--quiet]

Arguments:
    --input, -i    Directory containing JPG files (required).
    --output, -o   Output PDF filename (default: <input_folder>_searchable.pdf).
    --lang, -l     OCR language (default: 'eng').
    --threads, -t  Number of threads for OCR (default: auto-detect CPU cores).
    --quiet, -q    Suppress output messages.

Dependencies:
    - Python 3
    - PIL (Pillow)
    - pytesseract (Tesseract OCR)
    - PyPDF2
    - concurrent.futures (built-in)
"""

import os
import argparse
import io
from PIL import Image
import pytesseract
import concurrent.futures
import time
import sys
from PyPDF2 import PdfMerger

def process_image_ocr(args):
    """Run OCR on a single image and return the resulting one-page PDF data.

    Args:
        args: Tuple ``(img_path, lang, page_num, total_pages)``, optionally
            followed by a fifth element ``verbose`` (defaults to ``True``).
            Everything is packed into one tuple so the function can be used
            directly with ``Executor.map``.

    Returns:
        Whatever ``pytesseract.image_to_pdf_or_hocr`` returns for the image,
        or ``None`` if opening or OCRing the image raised an exception.
    """
    img_path, lang, page_num, total_pages, *extra = args
    # The optional fifth element keeps 4-tuple callers working unchanged.
    verbose = extra[0] if extra else True
    try:
        if verbose:
            # "\r" rewrites the same terminal line instead of scrolling.
            sys.stdout.write(f"\rProcessing page {page_num}/{total_pages}...")
            sys.stdout.flush()
        with Image.open(img_path) as img:
            return pytesseract.image_to_pdf_or_hocr(img, extension='pdf', lang=lang)
    except Exception as e:
        # Deliberately best-effort: one unreadable page must not abort the
        # whole run. The caller decides what to do with the missing page.
        print(f"\nError processing page {page_num}: {e}")
        return None

def create_searchable_pdf(input_dir, output_pdf, language='eng', threads=None, verbose=True):
    """Create a searchable PDF from the JPG images in a directory using OCR.

    Args:
        input_dir: Directory scanned (non-recursively) for ``.jpg``/``.jpeg``
            files; the extension match is case-insensitive.
        output_pdf: Path of the PDF file to write.
        language: OCR language passed through to Tesseract.
        threads: Number of worker threads; ``None`` uses the CPU count.
        verbose: When false, progress messages are suppressed. Errors and
            warnings are still printed.

    Returns:
        ``True`` if a PDF was written, ``False`` if no images were found or
        OCR failed for every page (in which case nothing is written).
    """
    start_time = time.time()
    if verbose:
        print(f"Processing images in '{input_dir}' with OCR...")

    # Pages are ordered lexicographically by path, so "page10.jpg" sorts
    # before "page2.jpg"; zero-pad file names to get numeric order.
    image_files = sorted(
        os.path.join(input_dir, f)
        for f in os.listdir(input_dir)
        if f.lower().endswith(('.jpg', '.jpeg'))
    )
    total_pages = len(image_files)

    if not image_files:
        print("No images found!")
        return False

    if verbose:
        print(f"Found {total_pages} images.")

    if threads is None:
        # os.cpu_count() returns None when the count cannot be determined.
        threads = os.cpu_count() or 1

    # Forward the verbosity flag so quiet mode also silences per-page progress.
    tasks = [
        (img_path, language, page_num, total_pages, verbose)
        for page_num, img_path in enumerate(image_files, start=1)
    ]
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        # map() yields results in input order, so page order is preserved.
        ocr_pdfs = list(executor.map(process_image_ocr, tasks))

    good_pages = [pdf for pdf in ocr_pdfs if pdf]
    if not good_pages:
        # Do not leave an empty PDF behind and report success.
        print("\nOCR failed for every page; no PDF written.")
        return False

    failed = total_pages - len(good_pages)
    if failed:
        print(f"\nWarning: {failed} of {total_pages} pages failed OCR and were skipped.")

    if verbose:
        print("\nMerging OCRed pages...")

    # Merge the per-page PDFs in memory, then write the result once.
    merger = PdfMerger()
    try:
        for pdf in good_pages:
            merger.append(io.BytesIO(pdf))
        with open(output_pdf, "wb") as f:
            merger.write(f)
    finally:
        # Release the merger's input streams even if append/write failed.
        merger.close()

    elapsed_time = time.time() - start_time
    if verbose:
        print(f"OCR completed in {elapsed_time:.2f} seconds.")
        print(f"Searchable PDF created: {output_pdf}")

    return True

def main():
    """Parse command-line arguments and build the searchable PDF.

    Exits with status 2 (argparse usage error) when the input directory does
    not exist or ``--threads`` is not positive, and with status 1 when PDF
    creation reports failure.
    """
    parser = argparse.ArgumentParser(description='Create a searchable PDF from JPG files with OCR')
    parser.add_argument('--input', '-i', required=True, help='Directory containing JPG files')
    parser.add_argument('--output', '-o', help='Output PDF filename (default: based on folder name)')
    parser.add_argument('--lang', '-l', default='eng', help='OCR language (default: eng)')
    parser.add_argument('--threads', '-t', type=int, help='Number of threads to use for OCR')
    parser.add_argument('--quiet', '-q', action='store_true', help='Minimize output')

    args = parser.parse_args()

    input_dir = os.path.normpath(args.input)

    # Fail with a usage message instead of a traceback from os.listdir.
    if not os.path.isdir(input_dir):
        parser.error(f"input directory not found: {args.input}")

    # ThreadPoolExecutor rejects max_workers <= 0 with a ValueError.
    if args.threads is not None and args.threads < 1:
        parser.error("--threads must be a positive integer")

    # Default output filename based on folder name
    if not args.output:
        # Resolve to an absolute path first so inputs like "." or ".." yield
        # the real folder name; the filesystem root has no name, hence the
        # fallback.
        folder_name = os.path.basename(os.path.abspath(input_dir)) or "output"
        args.output = os.path.join(input_dir, f"{folder_name}_searchable.pdf")

    verbose = not args.quiet

    # Create OCR PDF
    success = create_searchable_pdf(input_dir, args.output, args.lang, args.threads, verbose)

    if not success:
        sys.exit(1)

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
# No results found.