pathScripts/jpgpdfocr

124 lines
4 KiB
Python
Executable file

#!/usr/bin/env python3
"""
jpgpdfocr - Convert JPG images to a searchable PDF using OCR.
This script processes a directory of JPG images, runs OCR (Optical Character Recognition)
on each image using Tesseract, and merges them into a single searchable PDF.
Usage:
    ./jpgpdfocr --input <directory> [--output <file>] [--lang <language>]
                [--threads <num>] [--quiet]

Arguments:
    --input, -i    Directory containing JPG files (required).
    --output, -o   Output PDF filename (default: <input_folder>_searchable.pdf,
                   written inside the input directory).
    --lang, -l     OCR language (default: 'eng').
    --threads, -t  Number of threads for OCR (default: auto-detect CPU cores).
    --quiet, -q    Suppress output messages.

Dependencies:
- Python 3
- PIL (Pillow)
- pytesseract (Tesseract OCR)
- PyPDF2
- concurrent.futures (built-in)
"""
import os
import argparse
import io
from PIL import Image
import pytesseract
import concurrent.futures
import time
import sys
from PyPDF2 import PdfMerger
def process_image_ocr(args):
    """Run OCR on a single image and return the resulting one-page PDF.

    Written as a worker for ``Executor.map``, so every input arrives packed
    in one tuple.

    Args:
        args: ``(img_path, lang, page_num, total_pages)`` or
            ``(img_path, lang, page_num, total_pages, verbose)``.
            ``img_path`` is opened with PIL, ``lang`` is handed to Tesseract
            as the OCR language, and ``page_num`` / ``total_pages`` are used
            only in messages. The optional ``verbose`` flag (treated as True
            when omitted) controls whether the progress line is written.

    Returns:
        The PDF data produced by ``pytesseract.image_to_pdf_or_hocr``, or
        ``None`` if the page could not be processed.
    """
    img_path, lang, page_num, total_pages, *extra = args
    verbose = extra[0] if extra else True
    try:
        if verbose:
            # "\r" rewrites the same terminal line for each page.
            sys.stdout.write(f"\rProcessing page {page_num}/{total_pages}...")
            sys.stdout.flush()
        with Image.open(img_path) as img:
            return pytesseract.image_to_pdf_or_hocr(img, extension='pdf', lang=lang)
    except Exception as e:
        # Deliberate best-effort boundary: one unreadable page must not abort
        # the whole batch. The caller counts the None results.
        print(f"\nError processing page {page_num}: {e}")
        return None


def create_searchable_pdf(input_dir, output_pdf, language='eng', threads=None, verbose=True):
    """Create a searchable PDF from the JPG images in a directory using OCR.

    Files in ``input_dir`` whose names end in ``.jpg`` / ``.jpeg`` (case
    insensitive) are OCRed in a thread pool and the per-page PDFs are merged,
    in sorted filename order, into ``output_pdf``.

    Args:
        input_dir: Directory to scan for images (not recursive).
        output_pdf: Path of the PDF file to write.
        language: OCR language passed through to Tesseract.
        threads: Worker thread count; ``None`` uses the detected CPU count.
        verbose: When False, progress and status messages are suppressed.
            Error and warning messages are always printed.

    Returns:
        True if a PDF was written. False if the directory contains no
        matching images or if OCR failed for every page (no file is written
        in either case).

    Raises:
        OSError: If ``input_dir`` cannot be listed or ``output_pdf`` cannot
            be opened for writing.
        ValueError: Raised by ``ThreadPoolExecutor`` when ``threads`` is
            less than 1.
    """
    start_time = time.time()
    if verbose:
        print(f"Processing images in '{input_dir}' with OCR...")

    # Sorted (lexicographic) filename order defines the page order.
    image_files = sorted(
        os.path.join(input_dir, name)
        for name in os.listdir(input_dir)
        if name.lower().endswith(('.jpg', '.jpeg'))
    )
    total_pages = len(image_files)
    if not image_files:
        print("No images found!")
        return False
    if verbose:
        print(f"Found {total_pages} images.")

    if threads is None:
        # os.cpu_count() may return None when the count is undeterminable.
        threads = os.cpu_count() or 1

    # The verbose flag travels with each task so workers honour quiet mode.
    tasks = [
        (img_path, language, page_num, total_pages, verbose)
        for page_num, img_path in enumerate(image_files, start=1)
    ]
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        # executor.map preserves input order, so pages stay in sequence.
        ocr_pdfs = list(executor.map(process_image_ocr, tasks))

    pages = [pdf for pdf in ocr_pdfs if pdf]
    failed = total_pages - len(pages)
    if not pages:
        # Do not leave an empty PDF behind and report it as a success.
        print("\nOCR failed for every page; no PDF was written.")
        return False
    if failed:
        print(f"\nWarning: {failed} of {total_pages} pages failed OCR and were skipped.")

    # Merge the per-page PDFs in memory.
    if verbose:
        print("\nMerging OCRed pages...")
    merger = PdfMerger()
    try:
        for pdf in pages:
            merger.append(io.BytesIO(pdf))
        with open(output_pdf, "wb") as f:
            merger.write(f)
    finally:
        # Release the merger's input streams even if the merge or write fails.
        merger.close()

    elapsed_time = time.time() - start_time
    if verbose:
        print(f"OCR completed in {elapsed_time:.2f} seconds.")
        print(f"Searchable PDF created: {output_pdf}")
    return True
def main():
    """Parse command-line arguments and build the searchable PDF.

    Exits with status 2 (argparse usage error) when ``--input`` is not an
    existing directory or ``--threads`` is not a positive integer, and with
    status 1 when ``create_searchable_pdf`` reports failure.
    """
    parser = argparse.ArgumentParser(description='Create a searchable PDF from JPG files with OCR')
    parser.add_argument('--input', '-i', required=True, help='Directory containing JPG files')
    parser.add_argument('--output', '-o', help='Output PDF filename (default: based on folder name)')
    parser.add_argument('--lang', '-l', default='eng', help='OCR language (default: eng)')
    parser.add_argument('--threads', '-t', type=int, help='Number of threads to use for OCR')
    parser.add_argument('--quiet', '-q', action='store_true', help='Minimize output')
    args = parser.parse_args()

    input_dir = os.path.normpath(args.input)

    # Fail with a usage message instead of a traceback from deeper in the run.
    if not os.path.isdir(input_dir):
        parser.error(f"input directory not found: {args.input}")
    if args.threads is not None and args.threads < 1:
        parser.error("--threads must be a positive integer")

    # Default output filename based on folder name
    if not args.output:
        # Resolve through abspath so inputs such as "." or ".." yield the real
        # folder name rather than "._searchable.pdf". The filesystem root has
        # an empty basename, hence the fallback.
        folder_name = os.path.basename(os.path.abspath(input_dir)) or "output"
        args.output = os.path.join(input_dir, f"{folder_name}_searchable.pdf")

    verbose = not args.quiet

    # Create OCR PDF
    success = create_searchable_pdf(input_dir, args.output, args.lang, args.threads, verbose)
    if not success:
        sys.exit(1)


if __name__ == "__main__":
    main()