Auto-update: Sat Oct 26 13:45:39 PDT 2024
This commit is contained in:
parent
e4dc0ab99a
commit
99d919cb27
1 changed file with 202 additions and 97 deletions
299
bates
299
bates
|
@ -133,95 +133,126 @@ def ocr_page(pdf_path, page_num):
|
||||||
logging.error(f"[{filename}] OCR failed for page {page_num}: {str(e)}")
|
logging.error(f"[{filename}] OCR failed for page {page_num}: {str(e)}")
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
def extract_text_from_page(page, pdf_path, page_num, use_ocr):
|
def extract_text_from_page_multilayer(page, pdf_path, page_num):
|
||||||
"""Extract text from a page, using OCR if enabled and needed."""
|
"""Extract text from different PDF layers."""
|
||||||
filename = Path(pdf_path).name
|
filename = Path(pdf_path).name
|
||||||
# Get page dimensions
|
# Get page dimensions
|
||||||
width = page.width
|
width = page.width
|
||||||
height = page.height
|
height = page.height
|
||||||
|
|
||||||
# Calculate crop box for bottom fifth of page
|
# Calculate crop box for bottom fifth of page
|
||||||
padding = 2 # 2 point padding
|
padding = 2
|
||||||
|
|
||||||
# Start at 80% down the page (leaving bottom fifth)
|
|
||||||
y0 = max(0, min(height * 0.8, height - padding))
|
y0 = max(0, min(height * 0.8, height - padding))
|
||||||
y1 = max(y0 + padding, min(height, height))
|
y1 = max(y0 + padding, min(height, height))
|
||||||
|
|
||||||
# Use full width
|
|
||||||
x0 = padding
|
x0 = padding
|
||||||
x1 = max(x0 + padding, min(width - padding, width))
|
x1 = max(x0 + padding, min(width - padding, width))
|
||||||
|
|
||||||
# Ensure the crop box makes sense
|
crop_box = (x0, y0, x1, y1)
|
||||||
if x1 <= x0 or y1 <= y0:
|
|
||||||
logging.warning(f"[{filename}] Page {page_num}: Invalid crop box dimensions, using full page")
|
|
||||||
x0, y0 = 0, 0
|
|
||||||
x1, y1 = width, height
|
|
||||||
|
|
||||||
logging.info(f"[{filename}] Page {page_num}: Page size: {width}x{height} points")
|
logging.info(f"[{filename}] Page {page_num}: Dimensions {width}x{height}, crop box: ({x0:.2f}, {y0:.2f}, {x1:.2f}, {y1:.2f})")
|
||||||
logging.info(f"[{filename}] Page {page_num}: Crop box: ({x0:.2f}, {y0:.2f}, {x1:.2f}, {y1:.2f})")
|
|
||||||
|
texts = []
|
||||||
|
|
||||||
|
# Method 1: Try regular text extraction
|
||||||
try:
|
try:
|
||||||
# Extract text from the crop box
|
text = page.crop(crop_box).extract_text()
|
||||||
cropped_text = page.crop((x0, y0, x1, y1)).extract_text() or ""
|
if text:
|
||||||
logging.info(f"[{filename}] Page {page_num}: Cropped text: '{cropped_text}'")
|
logging.info(f"[{filename}] Page {page_num}: Regular extraction found: '{text}'")
|
||||||
|
texts.append(text)
|
||||||
# If we don't find anything in the crop, try the full page
|
|
||||||
if not cropped_text.strip():
|
|
||||||
logging.info(f"[{filename}] Page {page_num}: No text in crop box, trying full page")
|
|
||||||
full_text = page.extract_text() or ""
|
|
||||||
logging.info(f"[{filename}] Page {page_num}: Full page text: '{full_text}'")
|
|
||||||
return full_text
|
|
||||||
|
|
||||||
return cropped_text
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"[{filename}] Error extracting text from page {page_num}: {str(e)}")
|
logging.debug(f"[{filename}] Page {page_num}: Regular text extraction failed: {e}")
|
||||||
# If crop fails, try extracting text from the entire page
|
|
||||||
|
# Method 2: Try extracting words individually
|
||||||
|
try:
|
||||||
|
words = page.crop(crop_box).extract_words()
|
||||||
|
if words:
|
||||||
|
text = ' '.join(word['text'] for word in words)
|
||||||
|
logging.info(f"[{filename}] Page {page_num}: Word extraction found: '{text}'")
|
||||||
|
texts.append(text)
|
||||||
|
except Exception as e:
|
||||||
|
logging.debug(f"[{filename}] Page {page_num}: Word extraction failed: {e}")
|
||||||
|
|
||||||
|
# Method 3: Try extracting characters individually
|
||||||
|
try:
|
||||||
|
chars = page.crop(crop_box).chars
|
||||||
|
if chars:
|
||||||
|
text = ''.join(char['text'] for char in chars)
|
||||||
|
logging.info(f"[{filename}] Page {page_num}: Character extraction found: '{text}'")
|
||||||
|
texts.append(text)
|
||||||
|
except Exception as e:
|
||||||
|
logging.debug(f"[{filename}] Page {page_num}: Character extraction failed: {e}")
|
||||||
|
|
||||||
|
# Method 4: Try extracting annotations
|
||||||
|
try:
|
||||||
|
annots = page.annots
|
||||||
|
if annots and isinstance(annots, list): # Fix for the error
|
||||||
|
for annot in annots:
|
||||||
|
if isinstance(annot, dict) and 'contents' in annot:
|
||||||
|
text = annot['contents']
|
||||||
|
if text and not isinstance(text, str):
|
||||||
|
text = str(text)
|
||||||
|
if text and text.lower() != 'none':
|
||||||
|
logging.info(f"[{filename}] Page {page_num}: Annotation found: '{text}'")
|
||||||
|
texts.append(text)
|
||||||
|
except Exception as e:
|
||||||
|
logging.debug(f"[{filename}] Page {page_num}: Annotation extraction failed: {e}")
|
||||||
|
|
||||||
|
# Method 5: Try extracting text in reverse order
|
||||||
|
try:
|
||||||
|
chars = sorted(page.crop(crop_box).chars, key=lambda x: (-x['top'], x['x0']))
|
||||||
|
if chars:
|
||||||
|
text = ''.join(char['text'] for char in chars)
|
||||||
|
logging.info(f"[{filename}] Page {page_num}: Reverse order extraction found: '{text}'")
|
||||||
|
texts.append(text)
|
||||||
|
except Exception as e:
|
||||||
|
logging.debug(f"[{filename}] Page {page_num}: Reverse order extraction failed: {e}")
|
||||||
|
|
||||||
|
# Method 6: Last resort - flatten and OCR the crop box
|
||||||
|
if not texts:
|
||||||
try:
|
try:
|
||||||
logging.info(f"[{filename}] Attempting to extract text from full page")
|
logging.info(f"[{filename}] Page {page_num}: Attempting flatten and OCR")
|
||||||
text = page.extract_text() or ""
|
# Import needed only if we get this far
|
||||||
logging.info(f"[{filename}] Page {page_num}: Full page text: '{text}'")
|
from pdf2image import convert_from_bytes
|
||||||
return text
|
import pytesseract
|
||||||
except Exception as e2:
|
|
||||||
logging.error(f"[{filename}] Error extracting text from full page: {str(e2)}")
|
# Convert just this page to image
|
||||||
return ""
|
with tempfile.NamedTemporaryFile(suffix='.pdf') as tmp_pdf:
|
||||||
|
# Save just this page to a temporary PDF
|
||||||
|
writer = pdfplumber.PDF(page.page_obj)
|
||||||
|
writer.save(tmp_pdf.name)
|
||||||
|
|
||||||
|
# Convert to image
|
||||||
|
images = convert_from_bytes(open(tmp_pdf.name, 'rb').read())
|
||||||
|
if images:
|
||||||
|
# Crop the image to our area of interest
|
||||||
|
img = images[0]
|
||||||
|
img_width, img_height = img.size
|
||||||
|
crop_box_pixels = (
|
||||||
|
int(x0 * img_width / width),
|
||||||
|
int(y0 * img_height / height),
|
||||||
|
int(x1 * img_width / width),
|
||||||
|
int(y1 * img_height / height)
|
||||||
|
)
|
||||||
|
cropped = img.crop(crop_box_pixels)
|
||||||
|
|
||||||
|
# OCR the cropped area
|
||||||
|
text = pytesseract.image_to_string(cropped)
|
||||||
|
if text:
|
||||||
|
logging.info(f"[{filename}] Page {page_num}: Flatten/OCR found: '{text}'")
|
||||||
|
texts.append(text)
|
||||||
|
except Exception as e:
|
||||||
|
logging.debug(f"[{filename}] Page {page_num}: Flatten/OCR failed: {e}")
|
||||||
|
|
||||||
|
return texts
|
||||||
|
|
||||||
|
|
||||||
def extract_text_from_page_old(page, pdf_path, page_num, use_ocr):
|
def find_bates_number(texts, pattern):
|
||||||
"""Extract text from a page, using OCR if enabled and needed."""
|
"""Try to find Bates number in multiple text layers."""
|
||||||
filename = Path(pdf_path).name
|
for text in texts:
|
||||||
# Get page dimensions
|
matches = list(re.finditer(pattern, text))
|
||||||
width = page.width
|
if matches:
|
||||||
height = page.height
|
return matches[-1] # Return last match if found
|
||||||
|
return None
|
||||||
# Calculate crop box as relative position (bottom right corner)
|
|
||||||
# Use relative positioning and ensure we stay within bounds
|
|
||||||
x0 = min(width * 0.67, width - 10) # Start at 2/3 of the width, but ensure we stay in bounds
|
|
||||||
y0 = min(height * 0.83, height - 10) # Start at 5/6 of the height, but ensure we stay in bounds
|
|
||||||
x1 = width # Full width
|
|
||||||
y1 = height # Full height
|
|
||||||
|
|
||||||
# Ensure our crop box is within bounds
|
|
||||||
x0 = max(0, min(x0, width))
|
|
||||||
y0 = max(0, min(y0, height))
|
|
||||||
x1 = max(0, min(x1, width))
|
|
||||||
y1 = max(0, min(y1, height))
|
|
||||||
|
|
||||||
logging.debug(f"[{filename}] Page {page_num}: dimensions {width}x{height}, crop box: ({x0}, {y0}, {x1}, {y1})")
|
|
||||||
|
|
||||||
try:
|
|
||||||
text = page.crop((x0, y0, x1, y1)).extract_text() or ""
|
|
||||||
logging.debug(f"[{filename}] Page {page_num}: extracted text: '{text}'")
|
|
||||||
|
|
||||||
if use_ocr and len(text.split()) < 2:
|
|
||||||
logging.info(f"[{filename}] Page {page_num}: has less than 2 words, attempting OCR")
|
|
||||||
text = ocr_page(pdf_path, page_num)
|
|
||||||
logging.debug(f"[{filename}] Page {page_num}: OCR text: '{text}'")
|
|
||||||
|
|
||||||
return text
|
|
||||||
except Exception as e:
|
|
||||||
logging.error(f"[{filename}] Error extracting text from page {page_num}: {str(e)}")
|
|
||||||
return ""
|
|
||||||
|
|
||||||
def extract_bates_numbers(pdf_path, pattern, use_ocr):
|
def extract_bates_numbers(pdf_path, pattern, use_ocr):
|
||||||
"""Extract Bates numbers from first and last page of PDF using provided pattern."""
|
"""Extract Bates numbers from first and last page of PDF using provided pattern."""
|
||||||
|
@ -231,34 +262,47 @@ def extract_bates_numbers(pdf_path, pattern, use_ocr):
|
||||||
with pdfplumber.open(pdf_path) as pdf:
|
with pdfplumber.open(pdf_path) as pdf:
|
||||||
first_page = pdf.pages[0]
|
first_page = pdf.pages[0]
|
||||||
last_page = pdf.pages[-1]
|
last_page = pdf.pages[-1]
|
||||||
|
|
||||||
logging.debug(f"[{filename}] PDF has {len(pdf.pages)} pages")
|
# Try all PDF layers first
|
||||||
|
first_texts = extract_text_from_page_multilayer(first_page, pdf_path, 0)
|
||||||
first_text = extract_text_from_page(first_page, pdf_path, 0, use_ocr)
|
last_texts = extract_text_from_page_multilayer(last_page, pdf_path, len(pdf.pages)-1)
|
||||||
last_text = extract_text_from_page(last_page, pdf_path, len(pdf.pages)-1, use_ocr)
|
|
||||||
|
first_match = find_bates_number(first_texts, pattern)
|
||||||
logging.debug(f"[{filename}] First page text: '{first_text}'")
|
last_match = find_bates_number(last_texts, pattern)
|
||||||
logging.debug(f"[{filename}] Last page text: '{last_text}'")
|
|
||||||
|
# If no matches found, try flatten and OCR
|
||||||
first_matches = list(re.finditer(pattern, first_text))
|
if not first_match or not last_match:
|
||||||
last_matches = list(re.finditer(pattern, last_text))
|
logging.info(f"[{filename}] No matches in text layers, attempting flatten/OCR")
|
||||||
|
|
||||||
logging.debug(f"[{filename}] First page matches: {[m.group(0) for m in first_matches]}")
|
# For first page
|
||||||
logging.debug(f"[{filename}] Last page matches: {[m.group(0) for m in last_matches]}")
|
if not first_match:
|
||||||
|
try:
|
||||||
first_match = first_matches[-1] if first_matches else None
|
flattened_text = flatten_and_ocr_page(first_page, pdf_path, 0)
|
||||||
last_match = last_matches[-1] if last_matches else None
|
if flattened_text:
|
||||||
|
first_texts.append(flattened_text)
|
||||||
|
matches = list(re.finditer(pattern, flattened_text))
|
||||||
|
if matches:
|
||||||
|
first_match = matches[-1]
|
||||||
|
except Exception as e:
|
||||||
|
logging.error(f"[{filename}] Flatten/OCR failed for first page: {e}")
|
||||||
|
|
||||||
|
# For last page
|
||||||
|
if not last_match:
|
||||||
|
try:
|
||||||
|
flattened_text = flatten_and_ocr_page(last_page, pdf_path, len(pdf.pages)-1)
|
||||||
|
if flattened_text:
|
||||||
|
last_texts.append(flattened_text)
|
||||||
|
matches = list(re.finditer(pattern, flattened_text))
|
||||||
|
if matches:
|
||||||
|
last_match = matches[-1]
|
||||||
|
except Exception as e:
|
||||||
|
logging.error(f"[{filename}] Flatten/OCR failed for last page: {e}")
|
||||||
|
|
||||||
if first_match and last_match:
|
if first_match and last_match:
|
||||||
# Extract just the numbers from the full match
|
|
||||||
first_num = ''.join(filter(str.isdigit, first_match.group(0)))
|
first_num = ''.join(filter(str.isdigit, first_match.group(0)))
|
||||||
last_num = ''.join(filter(str.isdigit, last_match.group(0)))
|
last_num = ''.join(filter(str.isdigit, last_match.group(0)))
|
||||||
|
|
||||||
logging.info(f"[{filename}] Found numbers: {first_num}–{last_num}")
|
logging.info(f"[{filename}] Found numbers: {first_num}–{last_num}")
|
||||||
if len(first_matches) > 1:
|
|
||||||
logging.debug(f"[{filename}] Multiple matches on first page, using last match. All matches: {[m.group(0) for m in first_matches]}")
|
|
||||||
if len(last_matches) > 1:
|
|
||||||
logging.debug(f"[{filename}] Multiple matches on last page, using last match. All matches: {[m.group(0) for m in last_matches]}")
|
|
||||||
return (first_num, last_num)
|
return (first_num, last_num)
|
||||||
else:
|
else:
|
||||||
logging.warning(f"[{filename}] No matching numbers found")
|
logging.warning(f"[{filename}] No matching numbers found")
|
||||||
|
@ -267,6 +311,64 @@ def extract_bates_numbers(pdf_path, pattern, use_ocr):
|
||||||
logging.error(f"[{filename}] Error processing PDF: {str(e)}")
|
logging.error(f"[{filename}] Error processing PDF: {str(e)}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def flatten_and_ocr_page(page, pdf_path, page_num):
    """Rasterize one PDF page and OCR the strip along its bottom edge.

    The requested page is copied into a temporary single-page PDF, rendered
    to an image, cropped to the bottom fifth of the page (inset by a small
    padding) and handed to Tesseract.

    Args:
        page: Page object; only its ``width`` and ``height`` attributes are
            read, to compute the crop box in page coordinates.
        pdf_path: Path of the source PDF. The file is reopened here with
            PyPDF2 to copy the page out.
        page_num: Zero-based index of the page inside ``pdf_path``.

    Returns:
        The OCR text if Tesseract produced a non-empty string, otherwise
        ``None``. Every failure (optional dependency missing, unreadable
        PDF, rendering or OCR error) is logged and also yields ``None``;
        no exception propagates to the caller.
    """
    filename = Path(pdf_path).name
    logging.info(f"[{filename}] Page {page_num}: Attempting flatten and OCR")

    # Remembered outside the ``with`` so the ``finally`` clause can remove the
    # file on every exit path (success, empty result, or exception).
    tmp_path = None
    try:
        # Heavy optional dependencies: imported lazily so the rest of the
        # script still works when they are not installed. An ImportError is
        # handled by the generic handler below.
        from pdf2image import convert_from_path
        import pytesseract
        import PyPDF2

        # Page dimensions in page coordinates
        width = page.width
        height = page.height

        # Crop box for the bottom fifth of the page, inset by ``padding``.
        # The max() calls keep the box at least ``padding`` wide and tall.
        padding = 2
        y0 = max(0, min(height * 0.8, height - padding))
        y1 = max(y0 + padding, height)
        x0 = padding
        x1 = max(x0 + padding, width - padding)

        # Write a single-page PDF containing just this page. delete=False
        # because the renderer reopens the file by name; it is closed before
        # rendering so the data is fully flushed to disk.
        with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp_pdf:
            tmp_path = tmp_pdf.name
            pdf_writer = PyPDF2.PdfWriter()
            with open(pdf_path, 'rb') as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                pdf_writer.add_page(pdf_reader.pages[page_num])
                pdf_writer.write(tmp_pdf)

        # Convert to image
        images = convert_from_path(tmp_path)
        if not images:
            return None

        # Scale the crop box from page coordinates to image pixels
        img = images[0]
        img_width, img_height = img.size
        crop_box_pixels = (
            int(x0 * img_width / width),
            int(y0 * img_height / height),
            int(x1 * img_width / width),
            int(y1 * img_height / height),
        )
        cropped = img.crop(crop_box_pixels)

        # OCR the cropped area
        text = pytesseract.image_to_string(cropped)
        if text:
            logging.info(f"[{filename}] Page {page_num}: Flatten/OCR found: '{text}'")
            return text

    except Exception as e:
        logging.error(f"[{filename}] Page {page_num}: Flatten/OCR failed: {e}")
    finally:
        # Always remove the temporary PDF; previously it was leaked whenever
        # text was found (early return) or an exception was raised.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError as cleanup_error:
                logging.debug(
                    f"[{filename}] Page {page_num}: Could not remove temp file {tmp_path}: {cleanup_error}"
                )
    return None
|
||||||
|
|
||||||
def process_folder(folder_path, pattern, use_ocr, dry_run=False, name_prefix=None):
|
def process_folder(folder_path, pattern, use_ocr, dry_run=False, name_prefix=None):
|
||||||
"""Process all PDFs in the specified folder."""
|
"""Process all PDFs in the specified folder."""
|
||||||
folder = Path(folder_path)
|
folder = Path(folder_path)
|
||||||
|
@ -280,7 +382,10 @@ def process_folder(folder_path, pattern, use_ocr, dry_run=False, name_prefix=Non
|
||||||
success_count = 0
|
success_count = 0
|
||||||
rename_count = 0
|
rename_count = 0
|
||||||
|
|
||||||
for pdf_file in folder.glob('*.pdf'):
|
# Use simple case-insensitive matching
|
||||||
|
pdf_files = [f for f in folder.iterdir() if f.is_file() and f.suffix.lower() == '.pdf']
|
||||||
|
|
||||||
|
for pdf_file in pdf_files:
|
||||||
pdf_count += 1
|
pdf_count += 1
|
||||||
numbers = extract_bates_numbers(pdf_file, pattern, use_ocr)
|
numbers = extract_bates_numbers(pdf_file, pattern, use_ocr)
|
||||||
if numbers:
|
if numbers:
|
||||||
|
|
Loading…
Reference in a new issue