#!/usr/bin/env python3
"""Extract Bates numbers from PDFs and optionally rename the files.

Required packages:
    pip3 install pdfplumber pytesseract pdf2image
    # pdf2image and pytesseract only needed if using --ocr

System dependencies (only if using --ocr):
    brew install tesseract poppler  # on macOS
    # or
    sudo apt-get install tesseract-ocr poppler-utils  # on Ubuntu/Debian
"""

import argparse
import logging
import os
import re
import subprocess
import sys
import tempfile
from pathlib import Path

try:
    import pdfplumber
except ImportError:
    # Deferred on purpose: check_dependencies() reports the missing package
    # with install instructions instead of crashing at import time.
    pdfplumber = None


# AppleScript lines passed to osascript, one per "-e".  The file path and the
# comment arrive through argv, so neither is ever spliced into script text.
_FINDER_COMMENT_SCRIPT = (
    "on run argv",
    "set commentPath to POSIX file (item 1 of argv) as alias",
    'tell application "Finder" to set comment of commentPath to (item 2 of argv)',
    "end run",
)


def check_dependencies(ocr_enabled):
    """Exit with an install hint if a required third-party package is missing.

    pdfplumber is always required; pytesseract and pdf2image are only
    required when ``ocr_enabled`` is true.  Calls ``sys.exit(1)`` when
    anything is missing, otherwise returns None.
    """
    missing = []
    if pdfplumber is None:
        missing.append("pdfplumber")
    if ocr_enabled:
        try:
            import pytesseract  # noqa: F401
        except ImportError:
            missing.append("pytesseract")
        try:
            import pdf2image  # noqa: F401
        except ImportError:
            missing.append("pdf2image")

    if missing:
        print(f"Missing dependency: {', '.join(missing)}")
        print("Please install required packages:")
        print(f"pip3 install {' '.join(missing)}")
        sys.exit(1)


def setup_logging(log_level):
    """Configure root logging with the specified level name.

    Raises:
        ValueError: if ``log_level`` is not the name of a logging level.
    """
    numeric_level = getattr(logging, log_level.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError(f'Invalid log level: {log_level}')

    logging.basicConfig(
        level=numeric_level,
        format='%(asctime)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
    )


def build_regex_pattern(prefix, num_digits):
    """Build the regex source for ``prefix`` followed by ``num_digits`` digits.

    The prefix is matched literally (regex metacharacters are escaped).  The
    digits are captured in the named group ``number`` so callers can read the
    number without also picking up digits that belong to the prefix.  A
    negative lookahead rejects matches that are followed by another word
    character (letter, digit or underscore).
    """
    escaped_prefix = re.escape(prefix)
    pattern = rf"{escaped_prefix}(?P<number>\d{{{num_digits}}})(?!\w)"
    logging.debug(f"Generated regex pattern: {pattern}")
    return pattern


def set_finder_comment(file_path, comment):
    """Set the Finder comment for a file using osascript.

    Returns True on success and False on any failure (including osascript
    not being available).
    """
    command = ["osascript"]
    for script_line in _FINDER_COMMENT_SCRIPT:
        command += ["-e", script_line]
    # Resolve to an absolute path so "POSIX file" does not depend on how
    # osascript interprets relative paths.  The path is the first positional
    # argument and starts with "/", so a comment beginning with "-" that
    # follows it is not parsed as an option.
    command += [str(Path(file_path).resolve()), comment]

    try:
        # List argv, no shell: no quoting/escaping of path or comment needed.
        subprocess.run(command, check=True, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        stderr_text = e.stderr.decode(errors="replace") if e.stderr else ""
        logging.error(f"Failed to set Finder comment for {file_path}: {stderr_text}")
        return False
    except Exception as e:
        logging.error(f"Failed to set Finder comment for {file_path}: {e}")
        return False

    logging.debug(f"Set Finder comment for {file_path} to: {comment}")
    return True


def rename_with_bates(file_path, name_prefix, first_num, last_num):
    """Rename a file to ``<name_prefix><first>–<last><suffix>``.

    The original file name is stored as the Finder comment first; if that
    fails, or if the target name already exists, the file is left untouched.
    Returns True when the file was renamed, False otherwise.
    """
    try:
        path = Path(file_path)
        original_name = path.name
        new_name = f"{name_prefix}{first_num}–{last_num}{path.suffix}"
        new_path = path.parent / new_name

        # Path.rename() silently replaces an existing file on POSIX; refuse
        # rather than destroy another document with the same Bates range.
        if new_path.exists():
            logging.error(f"Skipping rename of {file_path}: {new_name} already exists")
            return False

        # Preserve the original name in metadata before renaming.
        if not set_finder_comment(file_path, original_name):
            logging.error(f"Skipping rename of {file_path} due to metadata failure")
            return False

        path.rename(new_path)
        logging.info(f"Renamed {original_name} to {new_name}")
        return True
    except Exception as e:
        logging.error(f"Failed to rename {file_path}: {e}")
        return False


def ocr_page(pdf_path, page_num):
    """OCR one whole page of a PDF and return its text.

    ``page_num`` is zero-based.  Returns an empty string when the page cannot
    be rendered or OCR fails.
    """
    filename = Path(pdf_path).name
    logging.debug(f"[{filename}] Running OCR on page {page_num}")

    try:
        # Import OCR-related modules only when needed.
        import pytesseract
        from pdf2image import convert_from_path

        # pdf2image page numbers are one-based.
        images = convert_from_path(
            str(pdf_path), first_page=page_num + 1, last_page=page_num + 1
        )
        if not images:
            logging.error(f"[{filename}] Failed to convert page {page_num} to image")
            return ""

        # The rendered image is handed to pytesseract directly; no temporary
        # file is needed.
        text = pytesseract.image_to_string(images[0])
        logging.debug(f"[{filename}] Page {page_num} OCR result: '{text}'")
        return text
    except Exception as e:
        logging.error(f"[{filename}] OCR failed for page {page_num}: {str(e)}")
        return ""


def _bottom_crop_box(width, height, padding=2):
    """Return ``(x0, y0, x1, y1)`` for the bottom fifth of a page, inset by padding."""
    y0 = max(0, min(height * 0.8, height - padding))
    y1 = max(y0 + padding, height)
    x0 = padding
    x1 = max(x0 + padding, width - padding)
    return (x0, y0, x1, y1)


def extract_text_from_page_multilayer(page, pdf_path, page_num):
    """Collect candidate text strings from the bottom fifth of a PDF page.

    Several extraction strategies are tried on the cropped region (plain
    text, words, characters, characters in bottom-up order) plus the page's
    annotation contents.  Each strategy that yields text appends one string
    to the result; a failing strategy is logged at debug level and skipped.
    No OCR is performed here; see flatten_and_ocr_page().

    Returns:
        A list of strings (possibly empty).
    """
    filename = Path(pdf_path).name
    width = page.width
    height = page.height
    crop_box = _bottom_crop_box(width, height)
    x0, y0, x1, y1 = crop_box
    logging.info(
        f"[{filename}] Page {page_num}: Dimensions {width}x{height}, "
        f"crop box: ({x0:.2f}, {y0:.2f}, {x1:.2f}, {y1:.2f})"
    )

    texts = []

    # Crop once and reuse the region for every text-layer strategy.
    try:
        cropped = page.crop(crop_box)
    except Exception as e:
        logging.debug(f"[{filename}] Page {page_num}: Cropping failed: {e}")
        cropped = None

    if cropped is not None:
        # Method 1: regular text extraction.
        try:
            text = cropped.extract_text()
            if text:
                logging.info(f"[{filename}] Page {page_num}: Regular extraction found: '{text}'")
                texts.append(text)
        except Exception as e:
            logging.debug(f"[{filename}] Page {page_num}: Regular text extraction failed: {e}")

        # Method 2: words joined with single spaces.
        try:
            words = cropped.extract_words()
            if words:
                text = ' '.join(word['text'] for word in words)
                logging.info(f"[{filename}] Page {page_num}: Word extraction found: '{text}'")
                texts.append(text)
        except Exception as e:
            logging.debug(f"[{filename}] Page {page_num}: Word extraction failed: {e}")

        # Method 3: raw characters in the order pdfplumber reports them.
        try:
            chars = cropped.chars
            if chars:
                text = ''.join(char['text'] for char in chars)
                logging.info(f"[{filename}] Page {page_num}: Character extraction found: '{text}'")
                texts.append(text)
        except Exception as e:
            logging.debug(f"[{filename}] Page {page_num}: Character extraction failed: {e}")

        # Method 5: characters sorted bottom-up, then left to right.
        try:
            chars = sorted(cropped.chars, key=lambda c: (-c['top'], c['x0']))
            if chars:
                text = ''.join(char['text'] for char in chars)
                logging.info(f"[{filename}] Page {page_num}: Reverse order extraction found: '{text}'")
                texts.append(text)
        except Exception as e:
            logging.debug(f"[{filename}] Page {page_num}: Reverse order extraction failed: {e}")

    # Method 4: annotation contents (whole page, not just the crop region).
    try:
        annots = page.annots
        if annots and isinstance(annots, list):
            for annot in annots:
                if not (isinstance(annot, dict) and 'contents' in annot):
                    continue
                text = annot['contents']
                if text and not isinstance(text, str):
                    text = str(text)
                if text and text.lower() != 'none':
                    logging.info(f"[{filename}] Page {page_num}: Annotation found: '{text}'")
                    texts.append(text)
    except Exception as e:
        logging.debug(f"[{filename}] Page {page_num}: Annotation extraction failed: {e}")

    return texts


def find_bates_number(texts, pattern):
    """Return the last regex match in the first text that contains any match.

    Returns None when no text matches ``pattern``.
    """
    for text in texts:
        matches = list(re.finditer(pattern, text))
        if matches:
            return matches[-1]
    return None


def flatten_and_ocr_page(page, pdf_path, page_num):
    """Render one page to an image and OCR the bottom-fifth crop area.

    ``page`` supplies the page dimensions used to scale the crop box to
    pixels; ``page_num`` is zero-based.  Returns the OCR text, or None when
    nothing was recognised or any step failed.
    """
    filename = Path(pdf_path).name
    logging.info(f"[{filename}] Page {page_num}: Attempting flatten and OCR")

    try:
        # Import needed only if we get this far.
        from pdf2image import convert_from_path
        import pytesseract

        width = page.width
        height = page.height
        x0, y0, x1, y1 = _bottom_crop_box(width, height)

        # Render just this page straight from the source PDF (pdf2image page
        # numbers are one-based).  No intermediate single-page PDF is
        # written, so there is no temporary file to clean up.
        images = convert_from_path(
            str(pdf_path), first_page=page_num + 1, last_page=page_num + 1
        )
        if not images:
            return None

        # Scale the crop box from page units to image pixels.
        img = images[0]
        img_width, img_height = img.size
        crop_box_pixels = (
            int(x0 * img_width / width),
            int(y0 * img_height / height),
            int(x1 * img_width / width),
            int(y1 * img_height / height),
        )
        text = pytesseract.image_to_string(img.crop(crop_box_pixels))
        if text:
            logging.info(f"[{filename}] Page {page_num}: Flatten/OCR found: '{text}'")
            return text
    except Exception as e:
        logging.error(f"[{filename}] Page {page_num}: Flatten/OCR failed: {e}")

    return None


def _match_on_page(page, pdf_path, page_num, label, pattern, use_ocr):
    """Find a Bates match on one page: text layers first, OCR only if enabled."""
    filename = Path(pdf_path).name
    texts = extract_text_from_page_multilayer(page, pdf_path, page_num)
    match = find_bates_number(texts, pattern)
    if match:
        return match

    if not use_ocr:
        logging.info(
            f"[{filename}] No match in text layers of {label} page "
            f"(OCR disabled; use --ocr to enable)"
        )
        return None

    logging.info(f"[{filename}] No match in text layers of {label} page, attempting flatten/OCR")
    ocr_text = flatten_and_ocr_page(page, pdf_path, page_num)
    if ocr_text:
        return find_bates_number([ocr_text], pattern)
    return None


def _bates_digits(match):
    """Return the numeric part of a Bates match."""
    try:
        # Patterns from build_regex_pattern() capture the number explicitly,
        # so digits inside the prefix are not included.
        return match.group("number")
    except IndexError:
        # Pattern without a "number" group: fall back to every digit matched.
        return ''.join(filter(str.isdigit, match.group(0)))


def extract_bates_numbers(pdf_path, pattern, use_ocr):
    """Extract Bates numbers from the first and last page of a PDF.

    Args:
        pdf_path: path of the PDF to read.
        pattern: regex source, normally from build_regex_pattern().
        use_ocr: when true, pages whose text layers yield no match are
            rendered and OCR'd as a fallback.

    Returns:
        ``(first_num, last_num)`` as digit strings, or None when either
        number is missing or the PDF cannot be processed.
    """
    filename = Path(pdf_path).name
    logging.info(f"[{filename}] Processing PDF")

    try:
        with pdfplumber.open(pdf_path) as pdf:
            page_count = len(pdf.pages)
            if page_count == 0:
                logging.warning(f"[{filename}] PDF has no pages")
                return None

            first_match = _match_on_page(
                pdf.pages[0], pdf_path, 0, "first", pattern, use_ocr
            )
            if page_count == 1:
                # First and last page are the same page; don't redo the work.
                last_match = first_match
            else:
                last_match = _match_on_page(
                    pdf.pages[-1], pdf_path, page_count - 1, "last", pattern, use_ocr
                )

            if first_match and last_match:
                first_num = _bates_digits(first_match)
                last_num = _bates_digits(last_match)
                logging.info(f"[{filename}] Found numbers: {first_num}–{last_num}")
                return (first_num, last_num)

            logging.warning(f"[{filename}] No matching numbers found")
            return None
    except Exception as e:
        logging.error(f"[{filename}] Error processing PDF: {str(e)}")
        return None


def process_folder(folder_path, pattern, use_ocr, dry_run=False, name_prefix=None):
    """Process all PDFs directly inside the specified folder.

    For each ``*.pdf`` file (case-insensitive, non-recursive) the Bates range
    is extracted.  In dry-run mode the range is printed; otherwise, when
    ``name_prefix`` is given, the file is renamed.  Summary counts are
    logged at the end.  Returns None.
    """
    folder = Path(folder_path)
    if not folder.is_dir():
        logging.error(f"Folder does not exist or is not a directory: {folder_path}")
        return

    logging.info(f"Processing folder: {folder_path}")

    # Materialise and sort up front: deterministic order, and renames during
    # the loop cannot affect the iteration.
    pdf_files = sorted(
        entry for entry in folder.iterdir()
        if entry.is_file() and entry.suffix.lower() == '.pdf'
    )

    success_count = 0
    rename_count = 0
    for pdf_file in pdf_files:
        numbers = extract_bates_numbers(pdf_file, pattern, use_ocr)
        if not numbers:
            continue
        success_count += 1
        first_num, last_num = numbers
        if dry_run:
            print(f"{pdf_file.name}: {first_num}–{last_num}")
        elif name_prefix is not None:
            if rename_with_bates(pdf_file, name_prefix, first_num, last_num):
                rename_count += 1

    logging.info(
        f"Processed {len(pdf_files)} PDFs, successfully extracted {success_count} number pairs"
    )
    if not dry_run and name_prefix is not None:
        logging.info(f"Renamed {rename_count} files")


def main():
    """Parse command-line arguments and run the folder processing."""
    parser = argparse.ArgumentParser(description='Extract Bates numbers from PDFs')
    parser.add_argument('folder', help='Path to folder containing PDFs')
    parser.add_argument('--log', default='INFO',
                        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
                        help='Set the logging level')
    # NOTE: --width-start / --height-start are accepted for backward
    # compatibility but are not read anywhere; the crop region is fixed to
    # the bottom fifth of the page.
    parser.add_argument('--width-start', type=float, default=0.67,
                        help='Relative x-coordinate to start crop (0-1)')
    parser.add_argument('--height-start', type=float, default=0.83,
                        help='Relative y-coordinate to start crop (0-1)')
    parser.add_argument('--prefix', type=str, default='FWS-',
                        help='Prefix pattern to search for (default: "FWS-")')
    parser.add_argument('--digits', type=int, default=6,
                        help='Number of digits to match after prefix (default: 6)')
    parser.add_argument('--ocr', action='store_true',
                        help='Enable OCR for pages with little or no text (disabled by default)')
    parser.add_argument('--dry-run', action='store_true',
                        help='Only print matches without renaming files')
    parser.add_argument('--name-prefix', type=str,
                        help='Prefix to use when renaming files (e.g., "FWS ")')

    args = parser.parse_args()
    if args.digits < 1:
        parser.error('--digits must be a positive integer')

    setup_logging(args.log)

    # Check dependencies based on whether OCR is enabled.
    check_dependencies(args.ocr)

    # Display the pattern we're looking for.
    display_pattern = f"{args.prefix}{'#' * args.digits}"
    print(f"Looking for pattern: {display_pattern}")

    if not args.dry_run and args.name_prefix is None:
        logging.error("Must specify --name-prefix when not in dry-run mode")
        sys.exit(1)

    pattern = build_regex_pattern(args.prefix, args.digits)
    process_folder(args.folder, pattern, args.ocr, args.dry_run, args.name_prefix)


if __name__ == '__main__':
    main()