#!/usr/bin/env python3
"""Create a searchable PDF from a directory of JPG images using OCR."""
import argparse
import concurrent.futures
import io
import os
import re
import sys
import time

from PIL import Image
from PyPDF2 import PdfMerger
import pytesseract

# Matches runs of digits; the capture group makes re.split keep them.
_DIGIT_RUN = re.compile(r"(\d+)")


def _natural_sort_key(name):
    """Return a sort key that orders digit runs in *name* numerically.

    With plain string sorting "page10.jpg" sorts before "page2.jpg", which
    would scramble the page order of the resulting PDF.
    """
    parts = _DIGIT_RUN.split(name)
    # re.split with one capture group alternates text (even index) and
    # digit runs (odd index), so same-position elements always share a type.
    numeric_aware = [int(p) if i % 2 else p for i, p in enumerate(parts)]
    # The raw name breaks ties such as "01.jpg" vs "1.jpg" deterministically.
    return (numeric_aware, name)


def _positive_int(value):
    """argparse type: parse *value* as an integer that is at least 1."""
    number = int(value)
    if number < 1:
        raise argparse.ArgumentTypeError(
            f"must be a positive integer, got {value!r}"
        )
    return number


def process_image_ocr(args):
    """Process a single image with OCR and return the OCR PDF content.

    Args:
        args: A 4-tuple ``(img_path, lang, page_num, total_pages)``. A single
            tuple is used so the function can be passed to ``executor.map``.

    Returns:
        The single-page PDF produced by pytesseract, or ``None`` if the image
        could not be processed (the error is printed, not raised).
    """
    img_path, lang, page_num, total_pages = args
    try:
        # "\r" rewrites the same terminal line to act as a progress indicator.
        sys.stdout.write(f"\rProcessing page {page_num}/{total_pages}...")
        sys.stdout.flush()
        with Image.open(img_path) as img:
            return pytesseract.image_to_pdf_or_hocr(img, extension='pdf', lang=lang)
    except Exception as e:
        # Best-effort per page: one bad image must not abort the whole run.
        print(f"\nError processing page {page_num}: {e}")
        return None


def create_searchable_pdf(input_dir, output_pdf, language='eng', threads=None, verbose=True):
    """Create a searchable PDF from images using OCR.

    Every ``.jpg``/``.jpeg`` file directly inside *input_dir* is OCRed (in
    natural filename order) and the resulting pages are merged into
    *output_pdf*.

    Args:
        input_dir: Directory containing the JPG images.
        output_pdf: Path of the PDF file to write.
        language: OCR language passed to pytesseract.
        threads: Number of worker threads; defaults to the CPU count.
        verbose: When true, print status and timing messages.

    Returns:
        ``True`` if a PDF was written, ``False`` if the directory could not
        be read, contained no images, or OCR failed for every page.

    Raises:
        ValueError: If *threads* is given and is less than 1 (raised by
            ``ThreadPoolExecutor``).
    """
    start_time = time.time()
    if verbose:
        print(f"Processing images in '{input_dir}' with OCR...")

    try:
        names = os.listdir(input_dir)
    except OSError as e:
        print(f"Cannot read input directory '{input_dir}': {e}")
        return False

    # Natural sort so that e.g. "2.jpg" precedes "10.jpg".
    image_files = [
        os.path.join(input_dir, name)
        for name in sorted(names, key=_natural_sort_key)
        if name.lower().endswith(('.jpg', '.jpeg'))
    ]
    total_pages = len(image_files)

    if not image_files:
        print("No images found!")
        return False

    if verbose:
        print(f"Found {total_pages} images.")

    # Use thread pool for parallel OCR
    if threads is None:
        threads = os.cpu_count() or 1

    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        tasks = [
            (img_path, language, page_num, total_pages)
            for page_num, img_path in enumerate(image_files, start=1)
        ]
        # executor.map yields results in task order, so page order is kept.
        ocr_pdfs = list(executor.map(process_image_ocr, tasks))

    failed_pages = [
        page_num for page_num, pdf in enumerate(ocr_pdfs, start=1) if not pdf
    ]
    if len(failed_pages) == total_pages:
        # Do not write an empty PDF and report success.
        print("\nOCR failed for every page; no PDF was written.")
        return False
    if failed_pages:
        skipped = ", ".join(str(page_num) for page_num in failed_pages)
        print(f"\nWarning: {len(failed_pages)} page(s) failed OCR and were skipped: {skipped}")

    # Merge PDFs in memory
    if verbose:
        print("\nMerging OCRed pages...")

    merger = PdfMerger()
    try:
        for pdf in ocr_pdfs:
            if pdf:
                merger.append(io.BytesIO(pdf))

        with open(output_pdf, "wb") as f:
            merger.write(f)
    finally:
        # Release the merger's resources even if appending or writing fails.
        merger.close()

    elapsed_time = time.time() - start_time
    if verbose:
        print(f"OCR completed in {elapsed_time:.2f} seconds.")
        print(f"Searchable PDF created: {output_pdf}")

    return True


def main():
    """Parse command-line arguments and build the searchable PDF.

    Exits with status 1 if the PDF could not be created.
    """
    parser = argparse.ArgumentParser(description='Create a searchable PDF from JPG files with OCR')
    parser.add_argument('--input', '-i', required=True, help='Directory containing JPG files')
    parser.add_argument('--output', '-o', help='Output PDF filename (default: based on folder name)')
    parser.add_argument('--lang', '-l', default='eng', help='OCR language (default: eng)')
    parser.add_argument('--threads', '-t', type=_positive_int, help='Number of threads to use for OCR')
    parser.add_argument('--quiet', '-q', action='store_true', help='Minimize output')

    args = parser.parse_args()

    input_dir = os.path.normpath(args.input)

    # Default output filename based on folder name
    if not args.output:
        # Resolve to an absolute path first so inputs like "." or ".." yield
        # the real folder name rather than "." / "..". A filesystem root has
        # an empty basename, hence the fallback.
        folder_name = os.path.basename(os.path.abspath(input_dir)) or "output"
        args.output = os.path.join(input_dir, f"{folder_name}_searchable.pdf")

    verbose = not args.quiet

    # Create OCR PDF
    success = create_searchable_pdf(input_dir, args.output, args.lang, args.threads, verbose)

    if not success:
        sys.exit(1)


if __name__ == "__main__":
    main()