#!/usr/bin/env python3
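"""OCR a PDF with EasyOCR, processing the pages in resumable chunks.

Each chunk of pages is rendered with pdf2image, read with EasyOCR, and written
to its own text file under <pdf stem>_ocr_results/, so a rerun skips chunks
that already have output.

Example invocation (the script filename here is illustrative):

    python ocr_pdf.py scanned.pdf --chunk-size 50
"""
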
import argparse
import concurrent.futures
import logging
import sys
from pathlib import Path

import easyocr
import numpy as np  # EasyOCR takes numpy arrays, not PIL images
from pdf2image import convert_from_path
from PyPDF2 import PdfReader
from tqdm import tqdm


def setup_logging():
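    """Send log output to both the console and an ocr_process.log file."""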
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(),
            logging.FileHandler('ocr_process.log')
        ]
    )


def extract_images_from_pdf_chunk(pdf_path, start_page, num_pages):
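    """Render pages start_page through start_page + num_pages - 1 to images at 300 DPI."""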
    try:
        return convert_from_path(
            pdf_path,
            first_page=start_page,
            last_page=start_page + num_pages - 1,
            dpi=300,
        )
    except Exception as e:
        logging.error(f"Error extracting pages {start_page}-{start_page + num_pages - 1}: {e}")
        raise


def process_page(image):
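    """OCR a single page image; EasyOCR returns a list of (bbox, text, confidence) tuples."""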
    # A Reader is created per page, which reloads the OCR models on every call.
    reader = easyocr.Reader(['en'], gpu=True)
    # readtext needs a path, bytes, or numpy array, so convert the PIL image.
    return reader.readtext(np.array(image))


def process_chunk(pdf_path, start_page, num_pages):
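    """Render one chunk of pages and OCR them in parallel, returning results in page order."""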
    images = extract_images_from_pdf_chunk(pdf_path, start_page, num_pages)
    results = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_page, image) for image in images]
        # Collect results in submission order so they line up with page numbers;
        # a failed page becomes an empty result to preserve that alignment.
        for future in futures:
            try:
                results.append(future.result())
            except Exception as e:
                logging.error(f"Error processing page: {e}")
                results.append([])
    return results


def main():
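    """Parse arguments and OCR the PDF chunk by chunk, skipping chunks that already have output."""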
    parser = argparse.ArgumentParser(description='OCR a PDF file using EasyOCR')
    parser.add_argument('pdf_path', type=str, help='Path to the PDF file')
    parser.add_argument('--chunk-size', type=int, default=100,
                        help='Number of pages to process in each chunk')
    args = parser.parse_args()

    pdf_path = Path(args.pdf_path)
    if not pdf_path.exists():
        print(f"Error: File {pdf_path} does not exist")
        sys.exit(1)

    setup_logging()
    logging.info(f"Starting OCR process for {pdf_path}")

    # Create output directory
    output_dir = pdf_path.parent / f"{pdf_path.stem}_ocr_results"
    output_dir.mkdir(exist_ok=True)

    reader = PdfReader(str(pdf_path))
    total_pages = len(reader.pages)

    with tqdm(total=total_pages) as pbar:
        for start_page in range(1, total_pages + 1, args.chunk_size):
            chunk_size = min(args.chunk_size, total_pages - start_page + 1)
            chunk_output = output_dir / f"chunk_{start_page:06d}.txt"

            if chunk_output.exists():
                logging.info(f"Skipping existing chunk {start_page}")
                pbar.update(chunk_size)
                continue

            try:
                results = process_chunk(str(pdf_path), start_page, chunk_size)

                # Save results: one "=== Page N ===" block per page, keeping only
                # the text field of each (bbox, text, confidence) result.
                with open(chunk_output, 'w', encoding='utf-8') as f:
                    for page_num, page_results in enumerate(results, start_page):
                        f.write(f"=== Page {page_num} ===\n")
                        for text_result in page_results:
                            f.write(f"{text_result[1]}\n")
                        f.write("\n")

                pbar.update(chunk_size)
                logging.info(f"Completed chunk starting at page {start_page}")

            except Exception as e:
                logging.error(f"Failed to process chunk starting at page {start_page}: {e}")
                continue

    logging.info("OCR process complete")


if __name__ == '__main__':
    main()