Auto-update: Sat Oct 26 13:45:39 PDT 2024
This commit is contained in:
parent
e4dc0ab99a
commit
99d919cb27
1 changed files with 202 additions and 97 deletions
299
bates
299
bates
|
@ -133,95 +133,126 @@ def ocr_page(pdf_path, page_num):
|
|||
logging.error(f"[{filename}] OCR failed for page {page_num}: {str(e)}")
|
||||
return ""
|
||||
|
||||
def extract_text_from_page(page, pdf_path, page_num, use_ocr):
|
||||
"""Extract text from a page, using OCR if enabled and needed."""
|
||||
def extract_text_from_page_multilayer(page, pdf_path, page_num):
|
||||
"""Extract text from different PDF layers."""
|
||||
filename = Path(pdf_path).name
|
||||
# Get page dimensions
|
||||
width = page.width
|
||||
height = page.height
|
||||
|
||||
# Calculate crop box for bottom fifth of page
|
||||
padding = 2 # 2 point padding
|
||||
|
||||
# Start at 80% down the page (leaving bottom fifth)
|
||||
padding = 2
|
||||
y0 = max(0, min(height * 0.8, height - padding))
|
||||
y1 = max(y0 + padding, min(height, height))
|
||||
|
||||
# Use full width
|
||||
x0 = padding
|
||||
x1 = max(x0 + padding, min(width - padding, width))
|
||||
|
||||
# Ensure the crop box makes sense
|
||||
if x1 <= x0 or y1 <= y0:
|
||||
logging.warning(f"[{filename}] Page {page_num}: Invalid crop box dimensions, using full page")
|
||||
x0, y0 = 0, 0
|
||||
x1, y1 = width, height
|
||||
crop_box = (x0, y0, x1, y1)
|
||||
|
||||
logging.info(f"[{filename}] Page {page_num}: Page size: {width}x{height} points")
|
||||
logging.info(f"[{filename}] Page {page_num}: Crop box: ({x0:.2f}, {y0:.2f}, {x1:.2f}, {y1:.2f})")
|
||||
|
||||
logging.info(f"[{filename}] Page {page_num}: Dimensions {width}x{height}, crop box: ({x0:.2f}, {y0:.2f}, {x1:.2f}, {y1:.2f})")
|
||||
|
||||
texts = []
|
||||
|
||||
# Method 1: Try regular text extraction
|
||||
try:
|
||||
# Extract text from the crop box
|
||||
cropped_text = page.crop((x0, y0, x1, y1)).extract_text() or ""
|
||||
logging.info(f"[{filename}] Page {page_num}: Cropped text: '{cropped_text}'")
|
||||
|
||||
# If we don't find anything in the crop, try the full page
|
||||
if not cropped_text.strip():
|
||||
logging.info(f"[{filename}] Page {page_num}: No text in crop box, trying full page")
|
||||
full_text = page.extract_text() or ""
|
||||
logging.info(f"[{filename}] Page {page_num}: Full page text: '{full_text}'")
|
||||
return full_text
|
||||
|
||||
return cropped_text
|
||||
|
||||
text = page.crop(crop_box).extract_text()
|
||||
if text:
|
||||
logging.info(f"[{filename}] Page {page_num}: Regular extraction found: '{text}'")
|
||||
texts.append(text)
|
||||
except Exception as e:
|
||||
logging.error(f"[{filename}] Error extracting text from page {page_num}: {str(e)}")
|
||||
# If crop fails, try extracting text from the entire page
|
||||
logging.debug(f"[{filename}] Page {page_num}: Regular text extraction failed: {e}")
|
||||
|
||||
# Method 2: Try extracting words individually
|
||||
try:
|
||||
words = page.crop(crop_box).extract_words()
|
||||
if words:
|
||||
text = ' '.join(word['text'] for word in words)
|
||||
logging.info(f"[{filename}] Page {page_num}: Word extraction found: '{text}'")
|
||||
texts.append(text)
|
||||
except Exception as e:
|
||||
logging.debug(f"[{filename}] Page {page_num}: Word extraction failed: {e}")
|
||||
|
||||
# Method 3: Try extracting characters individually
|
||||
try:
|
||||
chars = page.crop(crop_box).chars
|
||||
if chars:
|
||||
text = ''.join(char['text'] for char in chars)
|
||||
logging.info(f"[{filename}] Page {page_num}: Character extraction found: '{text}'")
|
||||
texts.append(text)
|
||||
except Exception as e:
|
||||
logging.debug(f"[{filename}] Page {page_num}: Character extraction failed: {e}")
|
||||
|
||||
# Method 4: Try extracting annotations
|
||||
try:
|
||||
annots = page.annots
|
||||
if annots and isinstance(annots, list): # Fix for the error
|
||||
for annot in annots:
|
||||
if isinstance(annot, dict) and 'contents' in annot:
|
||||
text = annot['contents']
|
||||
if text and not isinstance(text, str):
|
||||
text = str(text)
|
||||
if text and text.lower() != 'none':
|
||||
logging.info(f"[{filename}] Page {page_num}: Annotation found: '{text}'")
|
||||
texts.append(text)
|
||||
except Exception as e:
|
||||
logging.debug(f"[{filename}] Page {page_num}: Annotation extraction failed: {e}")
|
||||
|
||||
# Method 5: Try extracting text in reverse order
|
||||
try:
|
||||
chars = sorted(page.crop(crop_box).chars, key=lambda x: (-x['top'], x['x0']))
|
||||
if chars:
|
||||
text = ''.join(char['text'] for char in chars)
|
||||
logging.info(f"[{filename}] Page {page_num}: Reverse order extraction found: '{text}'")
|
||||
texts.append(text)
|
||||
except Exception as e:
|
||||
logging.debug(f"[{filename}] Page {page_num}: Reverse order extraction failed: {e}")
|
||||
|
||||
# Method 6: Last resort - flatten and OCR the crop box
|
||||
if not texts:
|
||||
try:
|
||||
logging.info(f"[{filename}] Attempting to extract text from full page")
|
||||
text = page.extract_text() or ""
|
||||
logging.info(f"[{filename}] Page {page_num}: Full page text: '{text}'")
|
||||
return text
|
||||
except Exception as e2:
|
||||
logging.error(f"[{filename}] Error extracting text from full page: {str(e2)}")
|
||||
return ""
|
||||
logging.info(f"[{filename}] Page {page_num}: Attempting flatten and OCR")
|
||||
# Import needed only if we get this far
|
||||
from pdf2image import convert_from_bytes
|
||||
import pytesseract
|
||||
|
||||
# Convert just this page to image
|
||||
with tempfile.NamedTemporaryFile(suffix='.pdf') as tmp_pdf:
|
||||
# Save just this page to a temporary PDF
|
||||
writer = pdfplumber.PDF(page.page_obj)
|
||||
writer.save(tmp_pdf.name)
|
||||
|
||||
# Convert to image
|
||||
images = convert_from_bytes(open(tmp_pdf.name, 'rb').read())
|
||||
if images:
|
||||
# Crop the image to our area of interest
|
||||
img = images[0]
|
||||
img_width, img_height = img.size
|
||||
crop_box_pixels = (
|
||||
int(x0 * img_width / width),
|
||||
int(y0 * img_height / height),
|
||||
int(x1 * img_width / width),
|
||||
int(y1 * img_height / height)
|
||||
)
|
||||
cropped = img.crop(crop_box_pixels)
|
||||
|
||||
# OCR the cropped area
|
||||
text = pytesseract.image_to_string(cropped)
|
||||
if text:
|
||||
logging.info(f"[{filename}] Page {page_num}: Flatten/OCR found: '{text}'")
|
||||
texts.append(text)
|
||||
except Exception as e:
|
||||
logging.debug(f"[{filename}] Page {page_num}: Flatten/OCR failed: {e}")
|
||||
|
||||
return texts
|
||||
|
||||
|
||||
def extract_text_from_page_old(page, pdf_path, page_num, use_ocr):
    """Read the text in the bottom-right corner of a page, optionally via OCR.

    The crop box starts at roughly 2/3 of the page width and 5/6 of the page
    height (but never closer than 10 points to the right/bottom edge) and
    extends to the page's right and bottom edges.

    Args:
        page: Object exposing ``width``, ``height`` and ``crop(bbox)``; the
            cropped object must expose ``extract_text()``.
        pdf_path: Path of the PDF; used for log prefixes and handed to
            ``ocr_page`` when OCR is attempted.
        page_num: Page index used in log messages and handed to ``ocr_page``.
        use_ocr: When truthy, ``ocr_page`` is called if the cropped text has
            fewer than two whitespace-separated words.

    Returns:
        The extracted (or OCR'd) text, or ``""`` if extraction raised.
    """
    filename = Path(pdf_path).name
    width = page.width
    height = page.height

    def _clamp(value, upper):
        # Keep a coordinate inside [0, upper].
        return max(0, min(value, upper))

    # Bottom-right corner: relative start, bounded so the box stays on-page.
    x0 = _clamp(min(width * 0.67, width - 10), width)
    y0 = _clamp(min(height * 0.83, height - 10), height)
    x1 = _clamp(width, width)
    y1 = _clamp(height, height)

    logging.debug(f"[{filename}] Page {page_num}: dimensions {width}x{height}, crop box: ({x0}, {y0}, {x1}, {y1})")

    try:
        text = page.crop((x0, y0, x1, y1)).extract_text() or ""
        logging.debug(f"[{filename}] Page {page_num}: extracted text: '{text}'")

        # Text layer is good enough (or OCR is disabled): done.
        if not (use_ocr and len(text.split()) < 2):
            return text

        logging.info(f"[{filename}] Page {page_num}: has less than 2 words, attempting OCR")
        text = ocr_page(pdf_path, page_num)
        logging.debug(f"[{filename}] Page {page_num}: OCR text: '{text}'")
        return text
    except Exception as e:
        logging.error(f"[{filename}] Error extracting text from page {page_num}: {str(e)}")
        return ""
|
||||
def find_bates_number(texts, pattern):
    """Return the last ``pattern`` match from the first text layer that has one.

    Layers are searched in order; as soon as one layer contains at least one
    match, the final match within that layer is returned.

    Args:
        texts: Candidate text layers. May be an iterable of strings, a single
            string (treated as one layer), or ``None``/empty. Entries that are
            not strings are skipped instead of raising ``TypeError``.
        pattern: Regular expression (string or compiled pattern) accepted by
            ``re.finditer``.

    Returns:
        The last ``re.Match`` in the first matching layer, or ``None`` when
        nothing matches.
    """
    if not texts:
        return None
    if isinstance(texts, str):
        # A bare string would otherwise be iterated character by character.
        texts = [texts]

    for text in texts:
        if not isinstance(text, str):
            continue
        matches = list(re.finditer(pattern, text))
        if matches:
            return matches[-1]  # Last match within this layer
    return None
|
||||
|
||||
def extract_bates_numbers(pdf_path, pattern, use_ocr):
|
||||
"""Extract Bates numbers from first and last page of PDF using provided pattern."""
|
||||
|
@ -231,34 +262,47 @@ def extract_bates_numbers(pdf_path, pattern, use_ocr):
|
|||
with pdfplumber.open(pdf_path) as pdf:
|
||||
first_page = pdf.pages[0]
|
||||
last_page = pdf.pages[-1]
|
||||
|
||||
logging.debug(f"[{filename}] PDF has {len(pdf.pages)} pages")
|
||||
|
||||
first_text = extract_text_from_page(first_page, pdf_path, 0, use_ocr)
|
||||
last_text = extract_text_from_page(last_page, pdf_path, len(pdf.pages)-1, use_ocr)
|
||||
|
||||
logging.debug(f"[{filename}] First page text: '{first_text}'")
|
||||
logging.debug(f"[{filename}] Last page text: '{last_text}'")
|
||||
|
||||
first_matches = list(re.finditer(pattern, first_text))
|
||||
last_matches = list(re.finditer(pattern, last_text))
|
||||
|
||||
logging.debug(f"[{filename}] First page matches: {[m.group(0) for m in first_matches]}")
|
||||
logging.debug(f"[{filename}] Last page matches: {[m.group(0) for m in last_matches]}")
|
||||
|
||||
first_match = first_matches[-1] if first_matches else None
|
||||
last_match = last_matches[-1] if last_matches else None
|
||||
|
||||
|
||||
# Try all PDF layers first
|
||||
first_texts = extract_text_from_page_multilayer(first_page, pdf_path, 0)
|
||||
last_texts = extract_text_from_page_multilayer(last_page, pdf_path, len(pdf.pages)-1)
|
||||
|
||||
first_match = find_bates_number(first_texts, pattern)
|
||||
last_match = find_bates_number(last_texts, pattern)
|
||||
|
||||
# If no matches found, try flatten and OCR
|
||||
if not first_match or not last_match:
|
||||
logging.info(f"[{filename}] No matches in text layers, attempting flatten/OCR")
|
||||
|
||||
# For first page
|
||||
if not first_match:
|
||||
try:
|
||||
flattened_text = flatten_and_ocr_page(first_page, pdf_path, 0)
|
||||
if flattened_text:
|
||||
first_texts.append(flattened_text)
|
||||
matches = list(re.finditer(pattern, flattened_text))
|
||||
if matches:
|
||||
first_match = matches[-1]
|
||||
except Exception as e:
|
||||
logging.error(f"[{filename}] Flatten/OCR failed for first page: {e}")
|
||||
|
||||
# For last page
|
||||
if not last_match:
|
||||
try:
|
||||
flattened_text = flatten_and_ocr_page(last_page, pdf_path, len(pdf.pages)-1)
|
||||
if flattened_text:
|
||||
last_texts.append(flattened_text)
|
||||
matches = list(re.finditer(pattern, flattened_text))
|
||||
if matches:
|
||||
last_match = matches[-1]
|
||||
except Exception as e:
|
||||
logging.error(f"[{filename}] Flatten/OCR failed for last page: {e}")
|
||||
|
||||
if first_match and last_match:
|
||||
# Extract just the numbers from the full match
|
||||
first_num = ''.join(filter(str.isdigit, first_match.group(0)))
|
||||
last_num = ''.join(filter(str.isdigit, last_match.group(0)))
|
||||
|
||||
|
||||
logging.info(f"[{filename}] Found numbers: {first_num}–{last_num}")
|
||||
if len(first_matches) > 1:
|
||||
logging.debug(f"[{filename}] Multiple matches on first page, using last match. All matches: {[m.group(0) for m in first_matches]}")
|
||||
if len(last_matches) > 1:
|
||||
logging.debug(f"[{filename}] Multiple matches on last page, using last match. All matches: {[m.group(0) for m in last_matches]}")
|
||||
return (first_num, last_num)
|
||||
else:
|
||||
logging.warning(f"[{filename}] No matching numbers found")
|
||||
|
@ -267,6 +311,64 @@ def extract_bates_numbers(pdf_path, pattern, use_ocr):
|
|||
logging.error(f"[{filename}] Error processing PDF: {str(e)}")
|
||||
return None
|
||||
|
||||
def flatten_and_ocr_page(page, pdf_path, page_num):
    """Rasterize one PDF page and OCR the strip along its bottom edge.

    The page is copied into a temporary single-page PDF, rendered to an image,
    cropped to the bottom fifth of the page (with a 2-point padding on the
    left and right), and passed to Tesseract.

    Args:
        page: Object exposing ``width`` and ``height`` in page units; used
            only to compute the crop box and scale it to image pixels.
        pdf_path: Path of the source PDF on disk.
        page_num: Zero-based index of the page inside ``pdf_path``.

    Returns:
        The OCR text if any was recognized, otherwise ``None``. Every failure
        (missing OCR dependencies, unreadable PDF, OCR error) is logged and
        reported as ``None`` rather than raised.
    """
    filename = Path(pdf_path).name
    logging.info(f"[{filename}] Page {page_num}: Attempting flatten and OCR")

    tmp_path = None  # Set once the temp file exists so `finally` can remove it
    try:
        # Import needed only if we get this far
        from pdf2image import convert_from_path
        import pytesseract
        import PyPDF2

        # Get page dimensions
        width = page.width
        height = page.height

        # Calculate crop box for bottom fifth
        padding = 2
        y0 = max(0, min(height * 0.8, height - padding))
        y1 = max(y0 + padding, height)
        x0 = padding
        x1 = max(x0 + padding, width - padding)

        # Create a single-page PDF with just this page. The source file must
        # stay open while writing because the reader loads page data lazily.
        pdf_writer = PyPDF2.PdfWriter()
        with open(pdf_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            pdf_writer.add_page(pdf_reader.pages[page_num])
            with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp_pdf:
                tmp_path = tmp_pdf.name
                pdf_writer.write(tmp_pdf)
        # The temp file is closed (and flushed) here, so the rasterizer can
        # reopen it by name on every platform.

        # Convert to image
        images = convert_from_path(tmp_path)
        if images:
            # Crop the image to our area of interest, scaling page units
            # to pixels.
            img = images[0]
            img_width, img_height = img.size
            crop_box_pixels = (
                int(x0 * img_width / width),
                int(y0 * img_height / height),
                int(x1 * img_width / width),
                int(y1 * img_height / height),
            )
            cropped = img.crop(crop_box_pixels)

            # OCR the cropped area
            text = pytesseract.image_to_string(cropped)
            if text:
                logging.info(f"[{filename}] Page {page_num}: Flatten/OCR found: '{text}'")
                return text

    except Exception as e:
        logging.error(f"[{filename}] Page {page_num}: Flatten/OCR failed: {e}")
    finally:
        # Clean up the temporary file on every exit path, including the
        # successful `return text` above and any exception.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError as cleanup_error:
                logging.debug(f"[{filename}] Page {page_num}: Could not remove temp file {tmp_path}: {cleanup_error}")
    return None
|
||||
|
||||
def process_folder(folder_path, pattern, use_ocr, dry_run=False, name_prefix=None):
|
||||
"""Process all PDFs in the specified folder."""
|
||||
folder = Path(folder_path)
|
||||
|
@ -280,7 +382,10 @@ def process_folder(folder_path, pattern, use_ocr, dry_run=False, name_prefix=Non
|
|||
success_count = 0
|
||||
rename_count = 0
|
||||
|
||||
for pdf_file in folder.glob('*.pdf'):
|
||||
# Use simple case-insensitive matching
|
||||
pdf_files = [f for f in folder.iterdir() if f.is_file() and f.suffix.lower() == '.pdf']
|
||||
|
||||
for pdf_file in pdf_files:
|
||||
pdf_count += 1
|
||||
numbers = extract_bates_numbers(pdf_file, pattern, use_ocr)
|
||||
if numbers:
|
||||
|
|
Loading…
Reference in a new issue