Auto-update: Sat Oct 26 13:45:39 PDT 2024

This commit is contained in:
sanj 2024-10-26 13:45:39 -07:00
parent e4dc0ab99a
commit 99d919cb27

299
bates
View file

@ -133,95 +133,126 @@ def ocr_page(pdf_path, page_num):
logging.error(f"[{filename}] OCR failed for page {page_num}: {str(e)}") logging.error(f"[{filename}] OCR failed for page {page_num}: {str(e)}")
return "" return ""
def extract_text_from_page(page, pdf_path, page_num, use_ocr): def extract_text_from_page_multilayer(page, pdf_path, page_num):
"""Extract text from a page, using OCR if enabled and needed.""" """Extract text from different PDF layers."""
filename = Path(pdf_path).name filename = Path(pdf_path).name
# Get page dimensions # Get page dimensions
width = page.width width = page.width
height = page.height height = page.height
# Calculate crop box for bottom fifth of page # Calculate crop box for bottom fifth of page
padding = 2 # 2 point padding padding = 2
# Start at 80% down the page (leaving bottom fifth)
y0 = max(0, min(height * 0.8, height - padding)) y0 = max(0, min(height * 0.8, height - padding))
y1 = max(y0 + padding, min(height, height)) y1 = max(y0 + padding, min(height, height))
# Use full width
x0 = padding x0 = padding
x1 = max(x0 + padding, min(width - padding, width)) x1 = max(x0 + padding, min(width - padding, width))
# Ensure the crop box makes sense crop_box = (x0, y0, x1, y1)
if x1 <= x0 or y1 <= y0:
logging.warning(f"[{filename}] Page {page_num}: Invalid crop box dimensions, using full page")
x0, y0 = 0, 0
x1, y1 = width, height
logging.info(f"[{filename}] Page {page_num}: Page size: {width}x{height} points") logging.info(f"[{filename}] Page {page_num}: Dimensions {width}x{height}, crop box: ({x0:.2f}, {y0:.2f}, {x1:.2f}, {y1:.2f})")
logging.info(f"[{filename}] Page {page_num}: Crop box: ({x0:.2f}, {y0:.2f}, {x1:.2f}, {y1:.2f})")
texts = []
# Method 1: Try regular text extraction
try: try:
# Extract text from the crop box text = page.crop(crop_box).extract_text()
cropped_text = page.crop((x0, y0, x1, y1)).extract_text() or "" if text:
logging.info(f"[{filename}] Page {page_num}: Cropped text: '{cropped_text}'") logging.info(f"[{filename}] Page {page_num}: Regular extraction found: '{text}'")
texts.append(text)
# If we don't find anything in the crop, try the full page
if not cropped_text.strip():
logging.info(f"[{filename}] Page {page_num}: No text in crop box, trying full page")
full_text = page.extract_text() or ""
logging.info(f"[{filename}] Page {page_num}: Full page text: '{full_text}'")
return full_text
return cropped_text
except Exception as e: except Exception as e:
logging.error(f"[{filename}] Error extracting text from page {page_num}: {str(e)}") logging.debug(f"[{filename}] Page {page_num}: Regular text extraction failed: {e}")
# If crop fails, try extracting text from the entire page
# Method 2: Try extracting words individually
try:
words = page.crop(crop_box).extract_words()
if words:
text = ' '.join(word['text'] for word in words)
logging.info(f"[{filename}] Page {page_num}: Word extraction found: '{text}'")
texts.append(text)
except Exception as e:
logging.debug(f"[{filename}] Page {page_num}: Word extraction failed: {e}")
# Method 3: Try extracting characters individually
try:
chars = page.crop(crop_box).chars
if chars:
text = ''.join(char['text'] for char in chars)
logging.info(f"[{filename}] Page {page_num}: Character extraction found: '{text}'")
texts.append(text)
except Exception as e:
logging.debug(f"[{filename}] Page {page_num}: Character extraction failed: {e}")
# Method 4: Try extracting annotations
try:
annots = page.annots
if annots and isinstance(annots, list): # Fix for the error
for annot in annots:
if isinstance(annot, dict) and 'contents' in annot:
text = annot['contents']
if text and not isinstance(text, str):
text = str(text)
if text and text.lower() != 'none':
logging.info(f"[{filename}] Page {page_num}: Annotation found: '{text}'")
texts.append(text)
except Exception as e:
logging.debug(f"[{filename}] Page {page_num}: Annotation extraction failed: {e}")
# Method 5: Try extracting text in reverse order
try:
chars = sorted(page.crop(crop_box).chars, key=lambda x: (-x['top'], x['x0']))
if chars:
text = ''.join(char['text'] for char in chars)
logging.info(f"[{filename}] Page {page_num}: Reverse order extraction found: '{text}'")
texts.append(text)
except Exception as e:
logging.debug(f"[{filename}] Page {page_num}: Reverse order extraction failed: {e}")
# Method 6: Last resort - flatten and OCR the crop box
if not texts:
try: try:
logging.info(f"[{filename}] Attempting to extract text from full page") logging.info(f"[{filename}] Page {page_num}: Attempting flatten and OCR")
text = page.extract_text() or "" # Import needed only if we get this far
logging.info(f"[{filename}] Page {page_num}: Full page text: '{text}'") from pdf2image import convert_from_bytes
return text import pytesseract
except Exception as e2:
logging.error(f"[{filename}] Error extracting text from full page: {str(e2)}") # Convert just this page to image
return "" with tempfile.NamedTemporaryFile(suffix='.pdf') as tmp_pdf:
# Save just this page to a temporary PDF
writer = pdfplumber.PDF(page.page_obj)
writer.save(tmp_pdf.name)
# Convert to image
images = convert_from_bytes(open(tmp_pdf.name, 'rb').read())
if images:
# Crop the image to our area of interest
img = images[0]
img_width, img_height = img.size
crop_box_pixels = (
int(x0 * img_width / width),
int(y0 * img_height / height),
int(x1 * img_width / width),
int(y1 * img_height / height)
)
cropped = img.crop(crop_box_pixels)
# OCR the cropped area
text = pytesseract.image_to_string(cropped)
if text:
logging.info(f"[{filename}] Page {page_num}: Flatten/OCR found: '{text}'")
texts.append(text)
except Exception as e:
logging.debug(f"[{filename}] Page {page_num}: Flatten/OCR failed: {e}")
return texts
def extract_text_from_page_old(page, pdf_path, page_num, use_ocr): def find_bates_number(texts, pattern):
"""Extract text from a page, using OCR if enabled and needed.""" """Try to find Bates number in multiple text layers."""
filename = Path(pdf_path).name for text in texts:
# Get page dimensions matches = list(re.finditer(pattern, text))
width = page.width if matches:
height = page.height return matches[-1] # Return last match if found
return None
# Calculate crop box as relative position (bottom right corner)
# Use relative positioning and ensure we stay within bounds
x0 = min(width * 0.67, width - 10) # Start at 2/3 of the width, but ensure we stay in bounds
y0 = min(height * 0.83, height - 10) # Start at 5/6 of the height, but ensure we stay in bounds
x1 = width # Full width
y1 = height # Full height
# Ensure our crop box is within bounds
x0 = max(0, min(x0, width))
y0 = max(0, min(y0, height))
x1 = max(0, min(x1, width))
y1 = max(0, min(y1, height))
logging.debug(f"[{filename}] Page {page_num}: dimensions {width}x{height}, crop box: ({x0}, {y0}, {x1}, {y1})")
try:
text = page.crop((x0, y0, x1, y1)).extract_text() or ""
logging.debug(f"[{filename}] Page {page_num}: extracted text: '{text}'")
if use_ocr and len(text.split()) < 2:
logging.info(f"[{filename}] Page {page_num}: has less than 2 words, attempting OCR")
text = ocr_page(pdf_path, page_num)
logging.debug(f"[{filename}] Page {page_num}: OCR text: '{text}'")
return text
except Exception as e:
logging.error(f"[{filename}] Error extracting text from page {page_num}: {str(e)}")
return ""
def extract_bates_numbers(pdf_path, pattern, use_ocr): def extract_bates_numbers(pdf_path, pattern, use_ocr):
"""Extract Bates numbers from first and last page of PDF using provided pattern.""" """Extract Bates numbers from first and last page of PDF using provided pattern."""
@ -231,34 +262,47 @@ def extract_bates_numbers(pdf_path, pattern, use_ocr):
with pdfplumber.open(pdf_path) as pdf: with pdfplumber.open(pdf_path) as pdf:
first_page = pdf.pages[0] first_page = pdf.pages[0]
last_page = pdf.pages[-1] last_page = pdf.pages[-1]
logging.debug(f"[{filename}] PDF has {len(pdf.pages)} pages") # Try all PDF layers first
first_texts = extract_text_from_page_multilayer(first_page, pdf_path, 0)
first_text = extract_text_from_page(first_page, pdf_path, 0, use_ocr) last_texts = extract_text_from_page_multilayer(last_page, pdf_path, len(pdf.pages)-1)
last_text = extract_text_from_page(last_page, pdf_path, len(pdf.pages)-1, use_ocr)
first_match = find_bates_number(first_texts, pattern)
logging.debug(f"[{filename}] First page text: '{first_text}'") last_match = find_bates_number(last_texts, pattern)
logging.debug(f"[{filename}] Last page text: '{last_text}'")
# If no matches found, try flatten and OCR
first_matches = list(re.finditer(pattern, first_text)) if not first_match or not last_match:
last_matches = list(re.finditer(pattern, last_text)) logging.info(f"[{filename}] No matches in text layers, attempting flatten/OCR")
logging.debug(f"[{filename}] First page matches: {[m.group(0) for m in first_matches]}") # For first page
logging.debug(f"[{filename}] Last page matches: {[m.group(0) for m in last_matches]}") if not first_match:
try:
first_match = first_matches[-1] if first_matches else None flattened_text = flatten_and_ocr_page(first_page, pdf_path, 0)
last_match = last_matches[-1] if last_matches else None if flattened_text:
first_texts.append(flattened_text)
matches = list(re.finditer(pattern, flattened_text))
if matches:
first_match = matches[-1]
except Exception as e:
logging.error(f"[{filename}] Flatten/OCR failed for first page: {e}")
# For last page
if not last_match:
try:
flattened_text = flatten_and_ocr_page(last_page, pdf_path, len(pdf.pages)-1)
if flattened_text:
last_texts.append(flattened_text)
matches = list(re.finditer(pattern, flattened_text))
if matches:
last_match = matches[-1]
except Exception as e:
logging.error(f"[{filename}] Flatten/OCR failed for last page: {e}")
if first_match and last_match: if first_match and last_match:
# Extract just the numbers from the full match
first_num = ''.join(filter(str.isdigit, first_match.group(0))) first_num = ''.join(filter(str.isdigit, first_match.group(0)))
last_num = ''.join(filter(str.isdigit, last_match.group(0))) last_num = ''.join(filter(str.isdigit, last_match.group(0)))
logging.info(f"[{filename}] Found numbers: {first_num}{last_num}") logging.info(f"[{filename}] Found numbers: {first_num}{last_num}")
if len(first_matches) > 1:
logging.debug(f"[{filename}] Multiple matches on first page, using last match. All matches: {[m.group(0) for m in first_matches]}")
if len(last_matches) > 1:
logging.debug(f"[{filename}] Multiple matches on last page, using last match. All matches: {[m.group(0) for m in last_matches]}")
return (first_num, last_num) return (first_num, last_num)
else: else:
logging.warning(f"[{filename}] No matching numbers found") logging.warning(f"[{filename}] No matching numbers found")
@ -267,6 +311,64 @@ def extract_bates_numbers(pdf_path, pattern, use_ocr):
logging.error(f"[{filename}] Error processing PDF: {str(e)}") logging.error(f"[{filename}] Error processing PDF: {str(e)}")
return None return None
def flatten_and_ocr_page(page, pdf_path, page_num):
    """Rasterize a single page and OCR the bottom fifth of it.

    The page is copied into a temporary one-page PDF, rendered to an image,
    cropped to the same bottom-fifth box used for text-layer extraction,
    and passed to Tesseract.

    Args:
        page: Page object exposing ``width`` and ``height`` (in PDF points).
        pdf_path: Path of the source PDF on disk.
        page_num: Zero-based index of the page inside ``pdf_path``.

    Returns:
        The OCR text if any was recognised, otherwise ``None``. Failures
        (missing OCR dependencies, unreadable PDF, render errors) are logged
        and also yield ``None``; this function is best-effort and never raises.
    """
    filename = Path(pdf_path).name
    logging.info(f"[{filename}] Page {page_num}: Attempting flatten and OCR")

    # Tracked outside the try so the finally clause can always remove the
    # temporary file, including on the early "text found" return.
    tmp_path = None
    try:
        # Import needed only if we get this far
        from pdf2image import convert_from_path
        import pytesseract
        import PyPDF2

        # Get page dimensions
        width = page.width
        height = page.height

        # Calculate crop box for bottom fifth
        padding = 2
        y0 = max(0, min(height * 0.8, height - padding))
        y1 = max(y0 + padding, height)
        x0 = padding
        x1 = max(x0 + padding, width - padding)

        # Create a single-page PDF with just this page. The write happens
        # while the source file handle is still open, as in the original flow.
        pdf_writer = PyPDF2.PdfWriter()
        with open(pdf_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            pdf_writer.add_page(pdf_reader.pages[page_num])
            with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp_pdf:
                tmp_path = tmp_pdf.name
                pdf_writer.write(tmp_pdf)

        # The temp file is closed (and therefore fully flushed) before the
        # renderer opens it by name.
        images = convert_from_path(tmp_path)
        if not images:
            return None

        # Crop the image to our area of interest, scaling PDF points to pixels
        img = images[0]
        img_width, img_height = img.size
        crop_box_pixels = (
            int(x0 * img_width / width),
            int(y0 * img_height / height),
            int(x1 * img_width / width),
            int(y1 * img_height / height),
        )
        cropped = img.crop(crop_box_pixels)

        # OCR the cropped area
        text = pytesseract.image_to_string(cropped)
        if text:
            logging.info(f"[{filename}] Page {page_num}: Flatten/OCR found: '{text}'")
            return text
    except Exception as e:
        logging.error(f"[{filename}] Page {page_num}: Flatten/OCR failed: {e}")
    finally:
        # Clean up the temporary file on every exit path
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError as cleanup_error:
                logging.debug(
                    f"[{filename}] Page {page_num}: Could not remove temp file {tmp_path}: {cleanup_error}"
                )

    return None
def process_folder(folder_path, pattern, use_ocr, dry_run=False, name_prefix=None): def process_folder(folder_path, pattern, use_ocr, dry_run=False, name_prefix=None):
"""Process all PDFs in the specified folder.""" """Process all PDFs in the specified folder."""
folder = Path(folder_path) folder = Path(folder_path)
@ -280,7 +382,10 @@ def process_folder(folder_path, pattern, use_ocr, dry_run=False, name_prefix=Non
success_count = 0 success_count = 0
rename_count = 0 rename_count = 0
for pdf_file in folder.glob('*.pdf'): # Use simple case-insensitive matching
pdf_files = [f for f in folder.iterdir() if f.is_file() and f.suffix.lower() == '.pdf']
for pdf_file in pdf_files:
pdf_count += 1 pdf_count += 1
numbers = extract_bates_numbers(pdf_file, pattern, use_ocr) numbers = extract_bates_numbers(pdf_file, pattern, use_ocr)
if numbers: if numbers: