diff --git a/ocr b/ocr new file mode 100755 index 0000000..937c042 --- /dev/null +++ b/ocr @@ -0,0 +1,104 @@ +#!/usr/bin/env python3 + +import sys +import os +from pathlib import Path +from pdf2image import convert_from_path # This is the correct import +import easyocr +from PyPDF2 import PdfReader, PdfWriter +import concurrent.futures +import argparse +from tqdm import tqdm +import logging + +def setup_logging(): + logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s', + handlers=[ + logging.StreamHandler(), + logging.FileHandler('ocr_process.log') + ] + ) + +def extract_images_from_pdf_chunk(pdf_path, start_page, num_pages): + try: + return convert_from_path(pdf_path, # This is the correct function name + first_page=start_page, + last_page=start_page + num_pages - 1, + dpi=300) + except Exception as e: + logging.error(f"Error extracting pages {start_page}-{start_page+num_pages}: {e}") + raise + +def process_page(image): + reader = easyocr.Reader(['en'], gpu=True) + return reader.readtext(image) + +def process_chunk(pdf_path, start_page, num_pages): + images = extract_images_from_pdf_chunk(pdf_path, start_page, num_pages) + results = [] + with concurrent.futures.ThreadPoolExecutor() as executor: + futures = [executor.submit(process_page, image) for image in images] + for future in concurrent.futures.as_completed(futures): + try: + results.append(future.result()) + except Exception as e: + logging.error(f"Error processing page: {e}") + return results + +def main(): + parser = argparse.ArgumentParser(description='OCR a PDF file using EasyOCR') + parser.add_argument('pdf_path', type=str, help='Path to the PDF file') + parser.add_argument('--chunk-size', type=int, default=100, + help='Number of pages to process in each chunk') + args = parser.parse_args() + + pdf_path = Path(args.pdf_path) + if not pdf_path.exists(): + print(f"Error: File {pdf_path} does not exist") + sys.exit(1) + + setup_logging() + logging.info(f"Starting OCR process for {pdf_path}") + + # Create output directory + output_dir = pdf_path.parent / f"{pdf_path.stem}_ocr_results" + output_dir.mkdir(exist_ok=True) + + reader = PdfReader(str(pdf_path)) + total_pages = len(reader.pages) + + with tqdm(total=total_pages) as pbar: + for start_page in range(1, total_pages + 1, args.chunk_size): + chunk_size = min(args.chunk_size, total_pages - start_page + 1) + chunk_output = output_dir / f"chunk_{start_page:06d}.txt" + + if chunk_output.exists(): + logging.info(f"Skipping existing chunk {start_page}") + pbar.update(chunk_size) + continue + + try: + results = process_chunk(str(pdf_path), start_page, chunk_size) + + # Save results + with open(chunk_output, 'w', encoding='utf-8') as f: + for page_num, page_results in enumerate(results, start_page): + f.write(f"=== Page {page_num} ===\n") + for text_result in page_results: + f.write(f"{text_result[1]}\n") + f.write("\n") + + pbar.update(chunk_size) + logging.info(f"Completed chunk starting at page {start_page}") + + except Exception as e: + logging.error(f"Failed to process chunk starting at page {start_page}: {e}") + continue + + logging.info("OCR process complete") + +if __name__ == '__main__': + main() +