Auto-update: Wed Nov 13 09:17:05 PST 2024

2024-11-13 09:17:05 -08:00 · 2024-11-13 09:17:05 -08:00 · fba9a8cf28
commit fba9a8cf28
parent 037f9c2987
1 changed files with 104 additions and 0 deletions
--- a/104
+++ b/104
@ -0,0 +1,104 @@
+#!/usr/bin/env python3
+
+import argparse
+import concurrent.futures
+import logging
+import os
+import sys
+import threading
+from pathlib import Path
+
+import easyocr
+from pdf2image import convert_from_path  # This is the correct import
+from PyPDF2 import PdfReader, PdfWriter
+from tqdm import tqdm
+
+def setup_logging():
+    logging.basicConfig(
+        level=logging.INFO,
+        format='%(asctime)s - %(levelname)s - %(message)s',
+        handlers=[
+            logging.StreamHandler(),
+            logging.FileHandler('ocr_process.log')
+        ]
+    )
+
+def extract_images_from_pdf_chunk(pdf_path, start_page, num_pages):
+    try:
+        return convert_from_path(pdf_path,  # This is the correct function name
+                               first_page=start_page, 
+                               last_page=start_page + num_pages - 1,
+                               dpi=300)
+    except Exception as e:
+        logging.error(f"Error extracting pages {start_page}-{start_page+num_pages}: {e}")
+        raise
+
+def process_page(image):
+    reader = easyocr.Reader(['en'], gpu=True)
+    return reader.readtext(image)
+
+def process_chunk(pdf_path, start_page, num_pages):
+    images = extract_images_from_pdf_chunk(pdf_path, start_page, num_pages)
+    results = []
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        futures = [executor.submit(process_page, image) for image in images]
+        for future in concurrent.futures.as_completed(futures):
+            try:
+                results.append(future.result())
+            except Exception as e:
+                logging.error(f"Error processing page: {e}")
+    return results
+
+def main():
+    parser = argparse.ArgumentParser(description='OCR a PDF file using EasyOCR')
+    parser.add_argument('pdf_path', type=str, help='Path to the PDF file')
+    parser.add_argument('--chunk-size', type=int, default=100,
+                        help='Number of pages to process in each chunk')
+    args = parser.parse_args()
+
+    pdf_path = Path(args.pdf_path)
+    if not pdf_path.exists():
+        print(f"Error: File {pdf_path} does not exist")
+        sys.exit(1)
+
+    setup_logging()
+    logging.info(f"Starting OCR process for {pdf_path}")
+
+    # Create output directory
+    output_dir = pdf_path.parent / f"{pdf_path.stem}_ocr_results"
+    output_dir.mkdir(exist_ok=True)
+
+    reader = PdfReader(str(pdf_path))
+    total_pages = len(reader.pages)
+    
+    with tqdm(total=total_pages) as pbar:
+        for start_page in range(1, total_pages + 1, args.chunk_size):
+            chunk_size = min(args.chunk_size, total_pages - start_page + 1)
+            chunk_output = output_dir / f"chunk_{start_page:06d}.txt"
+            
+            if chunk_output.exists():
+                logging.info(f"Skipping existing chunk {start_page}")
+                pbar.update(chunk_size)
+                continue
+
+            try:
+                results = process_chunk(str(pdf_path), start_page, chunk_size)
+                
+                # Save results
+                with open(chunk_output, 'w', encoding='utf-8') as f:
+                    for page_num, page_results in enumerate(results, start_page):
+                        f.write(f"=== Page {page_num} ===\n")
+                        for text_result in page_results:
+                            f.write(f"{text_result[1]}\n")
+                        f.write("\n")
+                
+                pbar.update(chunk_size)
+                logging.info(f"Completed chunk starting at page {start_page}")
+                
+            except Exception as e:
+                logging.error(f"Failed to process chunk starting at page {start_page}: {e}")
+                continue
+
+    logging.info("OCR process complete")
+
+# Run the command-line entry point only when this file is executed as a
+# script, not when it is imported as a module.
+if __name__ == '__main__':
+    main()
+