def extract_text_from_page_multilayer(page, pdf_path, page_num):
    """Extract candidate text strings from several PDF text layers.

    Runs multiple pdfplumber extraction strategies over the bottom fifth
    of the page (where Bates stamps are normally placed) and returns every
    non-empty result so the caller can pattern-match each layer in turn.

    Args:
        page: pdfplumber page object.
        pdf_path: path to the PDF (used only for log context).
        page_num: zero-based page index (used only for log context).

    Returns:
        list[str]: the text found by each successful extraction method
        (possibly empty). The flatten-and-OCR fallback is NOT attempted
        here; the caller invokes flatten_and_ocr_page() when needed.
    """
    filename = Path(pdf_path).name

    # Page dimensions in PDF points.
    width = page.width
    height = page.height

    # Crop box covering the bottom fifth of the page, inset by a small
    # padding so a degenerate (zero-area) box cannot occur on tiny pages.
    padding = 2
    y0 = max(0, min(height * 0.8, height - padding))
    y1 = max(y0 + padding, min(height, height))
    x0 = padding
    x1 = max(x0 + padding, min(width - padding, width))

    crop_box = (x0, y0, x1, y1)

    logging.info(f"[{filename}] Page {page_num}: Dimensions {width}x{height}, crop box: ({x0:.2f}, {y0:.2f}, {x1:.2f}, {y1:.2f})")

    texts = []

    # Method 1: plain line-grouped text extraction from the cropped region.
    try:
        text = page.crop(crop_box).extract_text()
        if text:
            logging.info(f"[{filename}] Page {page_num}: Regular extraction found: '{text}'")
            texts.append(text)
    except Exception as e:
        logging.debug(f"[{filename}] Page {page_num}: Regular text extraction failed: {e}")

    # Method 2: word-level extraction (catches text the line grouper drops).
    try:
        words = page.crop(crop_box).extract_words()
        if words:
            text = ' '.join(word['text'] for word in words)
            logging.info(f"[{filename}] Page {page_num}: Word extraction found: '{text}'")
            texts.append(text)
    except Exception as e:
        logging.debug(f"[{filename}] Page {page_num}: Word extraction failed: {e}")

    # Method 3: character-level extraction (most permissive layer).
    try:
        chars = page.crop(crop_box).chars
        if chars:
            text = ''.join(char['text'] for char in chars)
            logging.info(f"[{filename}] Page {page_num}: Character extraction found: '{text}'")
            texts.append(text)
    except Exception as e:
        logging.debug(f"[{filename}] Page {page_num}: Character extraction failed: {e}")

    # Method 4: annotation contents (stamps are sometimes annotations).
    try:
        annots = page.annots
        if annots and isinstance(annots, list):
            for annot in annots:
                if isinstance(annot, dict) and 'contents' in annot:
                    text = annot['contents']
                    if text and not isinstance(text, str):
                        text = str(text)
                    # pdfplumber may surface the string "None" for empty
                    # contents; skip it along with genuinely empty values.
                    if text and text.lower() != 'none':
                        logging.info(f"[{filename}] Page {page_num}: Annotation found: '{text}'")
                        texts.append(text)
    except Exception as e:
        logging.debug(f"[{filename}] Page {page_num}: Annotation extraction failed: {e}")

    # Method 5: characters re-joined in bottom-up reading order, for stamps
    # whose glyphs were emitted out of visual order.
    try:
        chars = sorted(page.crop(crop_box).chars, key=lambda x: (-x['top'], x['x0']))
        if chars:
            text = ''.join(char['text'] for char in chars)
            logging.info(f"[{filename}] Page {page_num}: Reverse order extraction found: '{text}'")
            texts.append(text)
    except Exception as e:
        logging.debug(f"[{filename}] Page {page_num}: Reverse order extraction failed: {e}")

    # NOTE(fix): the previous "Method 6" tried to flatten-and-OCR the page
    # here via `pdfplumber.PDF(page.page_obj).save(...)` — an API that does
    # not exist, so it always raised and was silently swallowed. It has been
    # removed; the OCR fallback is performed (correctly, with PyPDF2) by
    # flatten_and_ocr_page(), which extract_bates_numbers() already calls
    # when no text layer matches. Observable behavior is unchanged.

    return texts
def find_bates_number(texts, pattern):
    """Return the last regex match of *pattern* across the given text layers.

    Args:
        texts: iterable of candidate strings (one per extraction layer).
        pattern: regex pattern string for the Bates stamp.

    Returns:
        re.Match for the last occurrence in the first layer that matches,
        or None when no layer matches.
    """
    for text in texts:
        matches = list(re.finditer(pattern, text))
        if matches:
            # Use the last occurrence: the stamp is typically the final
            # matching token on the page.
            return matches[-1]
    return None


def extract_bates_numbers(pdf_path, pattern, use_ocr):
    """Extract Bates numbers from the first and last page of a PDF.

    Tries the PDF text layers first; when a page yields no match and
    *use_ocr* is enabled, falls back to flattening the page to an image
    and running OCR on it.

    Args:
        pdf_path: path to the PDF file.
        pattern: regex pattern matching a Bates stamp.
        use_ocr: whether the expensive flatten/OCR fallback may be used.
            (FIX: this flag was previously ignored and the fallback ran
            unconditionally.)

    Returns:
        tuple[str, str]: (first_num, last_num) digit-only strings, or
        None when either page has no match or the PDF cannot be read.
    """
    filename = Path(pdf_path).name
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # First and last page share identical handling — do it once.
            page_indices = (0, len(pdf.pages) - 1)
            page_matches = []
            for page_idx in page_indices:
                page = pdf.pages[page_idx]
                texts = extract_text_from_page_multilayer(page, pdf_path, page_idx)
                match = find_bates_number(texts, pattern)

                # Fallback: flatten the page and OCR it — only when the
                # caller actually asked for OCR.
                if not match and use_ocr:
                    logging.info(f"[{filename}] Page {page_idx}: No matches in text layers, attempting flatten/OCR")
                    try:
                        flattened_text = flatten_and_ocr_page(page, pdf_path, page_idx)
                        if flattened_text:
                            found = list(re.finditer(pattern, flattened_text))
                            if found:
                                match = found[-1]
                    except Exception as e:
                        logging.error(f"[{filename}] Flatten/OCR failed for page {page_idx}: {e}")

                page_matches.append(match)

            first_match, last_match = page_matches
            if first_match and last_match:
                # Keep only the digits of each stamp for the range report.
                first_num = ''.join(filter(str.isdigit, first_match.group(0)))
                last_num = ''.join(filter(str.isdigit, last_match.group(0)))
                logging.info(f"[{filename}] Found numbers: {first_num}–{last_num}")
                return (first_num, last_num)

            logging.warning(f"[{filename}] No matching numbers found")
            return None
    except Exception as e:
        logging.error(f"[{filename}] Error processing PDF: {str(e)}")
        return None
def flatten_and_ocr_page(page, pdf_path, page_num):
    """Flatten a single PDF page to an image and OCR its bottom fifth.

    Last-resort fallback when no text layer contains a Bates stamp:
    writes the page out as a temporary one-page PDF, rasterizes it,
    crops the bottom-fifth region, and runs Tesseract on the crop.

    Args:
        page: pdfplumber page (used only for its point dimensions).
        pdf_path: path to the source PDF.
        page_num: zero-based index of the page to process.

    Returns:
        str: OCR'd text, or None when nothing was found or any step failed.
    """
    filename = Path(pdf_path).name
    logging.info(f"[{filename}] Page {page_num}: Attempting flatten and OCR")

    tmp_name = None
    try:
        # Heavy dependencies are imported lazily: only needed on fallback.
        from pdf2image import convert_from_path
        import pytesseract
        import PyPDF2

        width = page.width
        height = page.height

        # Same bottom-fifth crop box the text-layer extractor uses.
        padding = 2
        y0 = max(0, min(height * 0.8, height - padding))
        y1 = max(y0 + padding, min(height, height))
        x0 = padding
        x1 = max(x0 + padding, min(width - padding, width))

        # Write just this page into a temporary one-page PDF. delete=False
        # so pdf2image can reopen it by name; we remove it ourselves in the
        # finally block below.
        with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp_pdf:
            tmp_name = tmp_pdf.name
            pdf_writer = PyPDF2.PdfWriter()
            with open(pdf_path, 'rb') as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                pdf_writer.add_page(pdf_reader.pages[page_num])
                pdf_writer.write(tmp_pdf)
            tmp_pdf.flush()

        images = convert_from_path(tmp_name)
        if images:
            # Scale the PDF-point crop box into image-pixel coordinates.
            img = images[0]
            img_width, img_height = img.size
            crop_box_pixels = (
                int(x0 * img_width / width),
                int(y0 * img_height / height),
                int(x1 * img_width / width),
                int(y1 * img_height / height),
            )
            cropped = img.crop(crop_box_pixels)

            # OCR the cropped area.
            text = pytesseract.image_to_string(cropped)
            if text:
                logging.info(f"[{filename}] Page {page_num}: Flatten/OCR found: '{text}'")
                return text

    except Exception as e:
        logging.error(f"[{filename}] Page {page_num}: Flatten/OCR failed: {e}")
    finally:
        # BUGFIX: cleanup previously sat *after* `return text`, so the
        # delete=False temp file leaked on every successful OCR (and on any
        # exception). Always remove it here.
        if tmp_name:
            try:
                os.unlink(tmp_name)
            except OSError:
                pass

    return None