124 lines
4 KiB
Python
Executable file
124 lines
4 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
jpgpdfocr - Convert JPG images to a searchable PDF using OCR.
|
|
|
|
This script processes a directory of JPG images, runs OCR (Optical Character Recognition)
|
|
on each image using Tesseract, and merges them into a single searchable PDF.
|
|
|
|
Usage:
|
|
./jpgpdfocr --input <directory> [--output <file>] [--lang <language>]
|
|
[--threads <num>] [--quiet]
|
|
|
|
Arguments:
|
|
--input, -i Directory containing JPG files (required).
|
|
--output, -o Output PDF filename (default: <input_folder>_searchable.pdf).
|
|
--lang, -l OCR language (default: 'eng').
|
|
--threads, -t Number of threads for OCR (default: auto-detect CPU cores).
|
|
--quiet, -q Suppress output messages.
|
|
|
|
Dependencies:
|
|
- Python 3
|
|
- PIL (Pillow)
|
|
- pytesseract (Tesseract OCR)
|
|
- PyPDF2
|
|
- concurrent.futures (built-in)
|
|
"""
|
|
|
|
import os
|
|
import argparse
|
|
import io
|
|
from PIL import Image
|
|
import pytesseract
|
|
import concurrent.futures
|
|
import time
|
|
import sys
|
|
from PyPDF2 import PdfMerger
|
|
|
|
def process_image_ocr(args):
    """Run OCR on a single image and return the resulting one-page PDF.

    Args:
        args: A tuple ``(img_path, lang, page_num, total_pages)``, optionally
            followed by a fifth ``verbose`` flag (defaults to True when
            omitted). Everything is packed into one tuple so the function can
            be handed directly to ``executor.map``.

    Returns:
        The PDF data produced by ``pytesseract.image_to_pdf_or_hocr`` for this
        image, or None if the image could not be processed.
    """
    img_path, lang, page_num, total_pages, *extra = args
    verbose = extra[0] if extra else True
    try:
        if verbose:
            # "\r" rewrites the same terminal line for each page.
            sys.stdout.write(f"\rProcessing page {page_num}/{total_pages}...")
            sys.stdout.flush()
        with Image.open(img_path) as img:
            return pytesseract.image_to_pdf_or_hocr(img, extension='pdf', lang=lang)
    except Exception as e:
        # Best effort: one unreadable page must not abort the whole run.
        # Errors go to stderr so they remain visible in quiet mode and do not
        # mix with the progress line on stdout.
        print(f"\nError processing page {page_num}: {e}", file=sys.stderr)
        return None


def create_searchable_pdf(input_dir, output_pdf, language='eng', threads=None, verbose=True):
    """Create a searchable PDF from the JPG images in a directory using OCR.

    Args:
        input_dir: Directory that is scanned (non-recursively) for files
            ending in ``.jpg`` or ``.jpeg`` (case-insensitive).
        output_pdf: Path of the PDF file to write.
        language: Tesseract language code passed to the OCR step.
        threads: Number of worker threads; None means use the CPU count.
        verbose: When False, progress and status messages are suppressed
            (error messages are still printed).

    Returns:
        True if a PDF was written, False if the directory is missing, contains
        no matching images, or OCR failed for every image.
    """
    start_time = time.time()
    if verbose:
        print(f"Processing images in '{input_dir}' with OCR...")

    # Report a missing directory instead of crashing inside os.listdir().
    if not os.path.isdir(input_dir):
        print(f"Input directory not found: {input_dir}", file=sys.stderr)
        return False

    # Pages are ordered lexicographically by file name.
    image_files = sorted(
        os.path.join(input_dir, f)
        for f in os.listdir(input_dir)
        if f.lower().endswith(('.jpg', '.jpeg'))
    )
    total_pages = len(image_files)

    if not image_files:
        print("No images found!")
        return False

    if verbose:
        print(f"Found {total_pages} images.")

    if threads is None:
        # os.cpu_count() may return None when the count cannot be determined.
        threads = os.cpu_count() or 1

    # The verbose flag travels with each task so quiet mode also silences the
    # per-page progress output of the workers.
    tasks = [
        (img_path, language, page_num, total_pages, verbose)
        for page_num, img_path in enumerate(image_files, start=1)
    ]
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        # executor.map preserves input order, so pages stay sorted.
        ocr_pdfs = list(executor.map(process_image_ocr, tasks))

    pages = [pdf for pdf in ocr_pdfs if pdf]
    failed = total_pages - len(pages)
    if not pages:
        # Do not write an empty PDF and report success when nothing worked.
        print("OCR failed for every image; no PDF written.", file=sys.stderr)
        return False
    if failed:
        print(
            f"Warning: {failed} of {total_pages} pages failed OCR and were skipped.",
            file=sys.stderr,
        )

    if verbose:
        print("\nMerging OCRed pages...")

    # Merge the per-page PDFs in memory, then write the result once.
    merger = PdfMerger()
    try:
        for pdf in pages:
            merger.append(io.BytesIO(pdf))
        with open(output_pdf, "wb") as f:
            merger.write(f)
    finally:
        merger.close()

    elapsed_time = time.time() - start_time
    if verbose:
        print(f"OCR completed in {elapsed_time:.2f} seconds.")
        print(f"Searchable PDF created: {output_pdf}")

    return True
|
|
|
|
def main():
    """Parse command-line arguments and build the searchable PDF.

    When ``--output`` is omitted, the PDF is written inside the input
    directory as ``<folder_name>_searchable.pdf``. Exits with status 1 when
    ``create_searchable_pdf`` reports failure.
    """
    parser = argparse.ArgumentParser(description='Create a searchable PDF from JPG files with OCR')
    parser.add_argument('--input', '-i', required=True, help='Directory containing JPG files')
    parser.add_argument('--output', '-o', help='Output PDF filename (default: based on folder name)')
    parser.add_argument('--lang', '-l', default='eng', help='OCR language (default: eng)')
    parser.add_argument('--threads', '-t', type=int, help='Number of threads to use for OCR')
    parser.add_argument('--quiet', '-q', action='store_true', help='Minimize output')

    args = parser.parse_args()

    # Reject invalid thread counts here with a usage error rather than letting
    # the thread pool raise later.
    if args.threads is not None and args.threads < 1:
        parser.error("--threads must be a positive integer")

    input_dir = os.path.normpath(args.input)

    output_pdf = args.output
    if not output_pdf:
        # Resolve to an absolute path first so inputs such as "." or ".."
        # yield the real directory name instead of "._searchable.pdf".
        # The filesystem root has an empty basename, hence the fallback.
        folder_name = os.path.basename(os.path.abspath(input_dir)) or "output"
        output_pdf = os.path.join(input_dir, f"{folder_name}_searchable.pdf")

    verbose = not args.quiet

    success = create_searchable_pdf(input_dir, output_pdf, args.lang, args.threads, verbose)
    if not success:
        sys.exit(1)
|
|
|
|
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
|