#!/usr/bin/env python3
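"""OCR a PDF with EasyOCR, processing the pages in resumable chunks.

Each chunk of pages is rendered with pdf2image, read with EasyOCR, and written
to its own text file under <pdf stem>_ocr_results/, so a rerun skips chunks
that already have output.

Example invocation (the script filename here is illustrative):

    python ocr_pdf.py scanned.pdf --chunk-size 50
"""
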
import argparse
import concurrent.futures
import logging
import sys
from pathlib import Path

import easyocr
import numpy as np  # EasyOCR takes numpy arrays, not PIL images
from pdf2image import convert_from_path
from PyPDF2 import PdfReader
from tqdm import tqdm


def setup_logging():
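    """Send log output to both the console and an ocr_process.log file."""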
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(),
            logging.FileHandler('ocr_process.log')
        ]
    )


def extract_images_from_pdf_chunk(pdf_path, start_page, num_pages):
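    """Render pages start_page through start_page + num_pages - 1 to images at 300 DPI."""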
    try:
        return convert_from_path(
            pdf_path,
            first_page=start_page,
            last_page=start_page + num_pages - 1,
            dpi=300,
        )
    except Exception as e:
        logging.error(f"Error extracting pages {start_page}-{start_page + num_pages - 1}: {e}")
        raise


def process_page(image):
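    """OCR a single page image; EasyOCR returns a list of (bbox, text, confidence) tuples."""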
    # A Reader is created per page, which reloads the OCR models on every call.
    reader = easyocr.Reader(['en'], gpu=True)
    # readtext needs a path, bytes, or numpy array, so convert the PIL image.
    return reader.readtext(np.array(image))


def process_chunk(pdf_path, start_page, num_pages):
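    """Render one chunk of pages and OCR them in parallel, returning results in page order."""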
    images = extract_images_from_pdf_chunk(pdf_path, start_page, num_pages)
    results = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_page, image) for image in images]
        # Collect results in submission order so they line up with page numbers;
        # a failed page becomes an empty result to preserve that alignment.
        for future in futures:
            try:
                results.append(future.result())
            except Exception as e:
                logging.error(f"Error processing page: {e}")
                results.append([])
    return results


def main():
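    """Parse arguments and OCR the PDF chunk by chunk, skipping chunks that already have output."""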
    parser = argparse.ArgumentParser(description='OCR a PDF file using EasyOCR')
    parser.add_argument('pdf_path', type=str, help='Path to the PDF file')
    parser.add_argument('--chunk-size', type=int, default=100,
                        help='Number of pages to process in each chunk')
    args = parser.parse_args()

    pdf_path = Path(args.pdf_path)
    if not pdf_path.exists():
        print(f"Error: File {pdf_path} does not exist")
        sys.exit(1)

    setup_logging()
    logging.info(f"Starting OCR process for {pdf_path}")

    # Create output directory
    output_dir = pdf_path.parent / f"{pdf_path.stem}_ocr_results"
    output_dir.mkdir(exist_ok=True)

    reader = PdfReader(str(pdf_path))
    total_pages = len(reader.pages)

    with tqdm(total=total_pages) as pbar:
        for start_page in range(1, total_pages + 1, args.chunk_size):
            chunk_size = min(args.chunk_size, total_pages - start_page + 1)
            chunk_output = output_dir / f"chunk_{start_page:06d}.txt"

            if chunk_output.exists():
                logging.info(f"Skipping existing chunk {start_page}")
                pbar.update(chunk_size)
                continue

            try:
                results = process_chunk(str(pdf_path), start_page, chunk_size)

                # Save results: one "=== Page N ===" block per page, keeping only
                # the text field of each (bbox, text, confidence) result.
                with open(chunk_output, 'w', encoding='utf-8') as f:
                    for page_num, page_results in enumerate(results, start_page):
                        f.write(f"=== Page {page_num} ===\n")
                        for text_result in page_results:
                            f.write(f"{text_result[1]}\n")
                        f.write("\n")

                pbar.update(chunk_size)
                logging.info(f"Completed chunk starting at page {start_page}")

            except Exception as e:
                logging.error(f"Failed to process chunk starting at page {start_page}: {e}")
                continue

    logging.info("OCR process complete")


if __name__ == '__main__':
    main()