#!/usr/bin/env python3
"""OCR a large PDF in resumable chunks using pdf2image and EasyOCR.

Pages are rendered to images with pdf2image, recognised with EasyOCR, and the
extracted text is written to one file per chunk, so an interrupted run can be
resumed without redoing finished chunks.
"""
import sys
import argparse
import concurrent.futures
import logging
import threading
from pathlib import Path

import easyocr
import numpy as np
from pdf2image import convert_from_path
from PyPDF2 import PdfReader
from tqdm import tqdm


def setup_logging():
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(),
            logging.FileHandler('ocr_process.log')
        ]
    )


def extract_images_from_pdf_chunk(pdf_path, start_page, num_pages):
    """Render a contiguous range of pages to PIL images (pages are 1-indexed)."""
    try:
        return convert_from_path(pdf_path,
                                 first_page=start_page,
                                 last_page=start_page + num_pages - 1,
                                 dpi=300)
    except Exception as e:
        logging.error(f"Error extracting pages {start_page}-{start_page + num_pages - 1}: {e}")
        raise


# One EasyOCR reader per worker thread, created lazily and reused, so the
# detection/recognition models are not reloaded for every single page.
_thread_local = threading.local()


def get_reader():
    if not hasattr(_thread_local, 'reader'):
        _thread_local.reader = easyocr.Reader(['en'], gpu=True)
    return _thread_local.reader


def process_page(image):
    # EasyOCR accepts file paths, byte streams, or numpy arrays, but not PIL
    # images, so convert the pdf2image output before recognition.
    return get_reader().readtext(np.array(image))


def process_chunk(pdf_path, start_page, num_pages):
    images = extract_images_from_pdf_chunk(pdf_path, start_page, num_pages)
    results = []
    # A couple of workers is plenty: each worker thread keeps its own model in
    # memory, and the GPU serialises the actual inference anyway.
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        futures = [executor.submit(process_page, image) for image in images]
        # Collect results in submission order so the page numbers written to
        # the output file line up with the pages in the PDF.
        for offset, future in enumerate(futures):
            try:
                results.append(future.result())
            except Exception as e:
                logging.error(f"Error processing page {start_page + offset}: {e}")
                results.append([])  # keep page alignment when a page fails
    return results


def main():
    parser = argparse.ArgumentParser(description='OCR a PDF file using EasyOCR')
    parser.add_argument('pdf_path', type=str, help='Path to the PDF file')
    parser.add_argument('--chunk-size', type=int, default=100,
                        help='Number of pages to process in each chunk')
    args = parser.parse_args()

    pdf_path = Path(args.pdf_path)
    if not pdf_path.exists():
        print(f"Error: File {pdf_path} does not exist")
        sys.exit(1)

    setup_logging()
    logging.info(f"Starting OCR process for {pdf_path}")

    # One text file per chunk, written next to the input PDF.
    output_dir = pdf_path.parent / f"{pdf_path.stem}_ocr_results"
    output_dir.mkdir(exist_ok=True)

    total_pages = len(PdfReader(str(pdf_path)).pages)

    with tqdm(total=total_pages) as pbar:
        for start_page in range(1, total_pages + 1, args.chunk_size):
            chunk_size = min(args.chunk_size, total_pages - start_page + 1)
            chunk_output = output_dir / f"chunk_{start_page:06d}.txt"

            # Chunks that already have an output file were finished on a
            # previous run, so skip them.
            if chunk_output.exists():
                logging.info(f"Skipping existing chunk {start_page}")
                pbar.update(chunk_size)
                continue

            try:
                results = process_chunk(str(pdf_path), start_page, chunk_size)
                with open(chunk_output, 'w', encoding='utf-8') as f:
                    for page_num, page_results in enumerate(results, start_page):
                        f.write(f"=== Page {page_num} ===\n")
                        # Each EasyOCR result is (bounding_box, text, confidence);
                        # only the recognised text is kept.
                        for text_result in page_results:
                            f.write(f"{text_result[1]}\n")
                        f.write("\n")
                pbar.update(chunk_size)
                logging.info(f"Completed chunk starting at page {start_page}")
            except Exception as e:
                logging.error(f"Failed to process chunk starting at page {start_page}: {e}")
                pbar.update(chunk_size)
                continue

    logging.info("OCR process complete")


if __name__ == '__main__':
    main()
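
# --- Usage sketch ----------------------------------------------------------
# A minimal example of running the script, assuming the dependencies are
# installed and the poppler utilities (required by pdf2image) are on PATH.
# The file name ocr_pdf.py and the sample PDF name are illustrative only:
#
#   pip install pdf2image easyocr PyPDF2 tqdm numpy
#   python ocr_pdf.py scanned_book.pdf --chunk-size 50
#
# Output lands next to the PDF in scanned_book_ocr_results/, one text file per
# chunk (chunk_000001.txt, chunk_000051.txt, ...); chunks that already have an
# output file are skipped, so an interrupted run can simply be restarted.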