Auto-update: Sat Oct 26 13:45:39 PDT 2024

2024-10-26 13:45:39 -07:00 · 2024-10-26 13:45:39 -07:00 · 99d919cb27
commit 99d919cb27
parent e4dc0ab99a
1 changed files with 202 additions and 97 deletions
--- a/283
+++ b/283
@@ -133,95 +133,126 @@ def ocr_page(pdf_path, page_num):
        logging.error(f"[{filename}] OCR failed for page {page_num}: {str(e)}")
        return ""
-def extract_text_from_page(page, pdf_path, page_num, use_ocr):
+def extract_text_from_page_multilayer(page, pdf_path, page_num):
-    """Extract text from a page, using OCR if enabled and needed."""
+    """Extract text from different PDF layers."""
    filename = Path(pdf_path).name
    # Get page dimensions
    width = page.width
    height = page.height
    # Calculate crop box for bottom fifth of page
-    padding = 2  # 2 point padding
+    padding = 2
    # Start at 80% down the page (leaving bottom fifth)
    y0 = max(0, min(height * 0.8, height - padding))
    y1 = max(y0 + padding, min(height, height))
    # Use full width
    x0 = padding
    x1 = max(x0 + padding, min(width - padding, width))
-    # Ensure the crop box makes sense
+    crop_box = (x0, y0, x1, y1)
    if x1 <= x0 or y1 <= y0:
        logging.warning(f"[{filename}] Page {page_num}: Invalid crop box dimensions, using full page")
        x0, y0 = 0, 0
        x1, y1 = width, height
-    logging.info(f"[{filename}] Page {page_num}: Page size: {width}x{height} points")
+    logging.info(f"[{filename}] Page {page_num}: Dimensions {width}x{height}, crop box: ({x0:.2f}, {y0:.2f}, {x1:.2f}, {y1:.2f})")
    logging.info(f"[{filename}] Page {page_num}: Crop box: ({x0:.2f}, {y0:.2f}, {x1:.2f}, {y1:.2f})")
    texts = []
    # Method 1: Try regular text extraction
    try:
-        # Extract text from the crop box
+        text = page.crop(crop_box).extract_text()
-        cropped_text = page.crop((x0, y0, x1, y1)).extract_text() or ""
+        if text:
-        logging.info(f"[{filename}] Page {page_num}: Cropped text: '{cropped_text}'")
+            logging.info(f"[{filename}] Page {page_num}: Regular extraction found: '{text}'")
-        
+            texts.append(text)
        # If we don't find anything in the crop, try the full page
        if not cropped_text.strip():
            logging.info(f"[{filename}] Page {page_num}: No text in crop box, trying full page")
            full_text = page.extract_text() or ""
            logging.info(f"[{filename}] Page {page_num}: Full page text: '{full_text}'")
            return full_text
        return cropped_text
    except Exception as e:
-        logging.error(f"[{filename}] Error extracting text from page {page_num}: {str(e)}")
+        logging.debug(f"[{filename}] Page {page_num}: Regular text extraction failed: {e}")
-        # If crop fails, try extracting text from the entire page
+
    # Method 2: Try extracting words individually
    try:
        words = page.crop(crop_box).extract_words()
        if words:
            text = ' '.join(word['text'] for word in words)
            logging.info(f"[{filename}] Page {page_num}: Word extraction found: '{text}'")
            texts.append(text)
    except Exception as e:
        logging.debug(f"[{filename}] Page {page_num}: Word extraction failed: {e}")
    # Method 3: Try extracting characters individually
    try:
        chars = page.crop(crop_box).chars
        if chars:
            text = ''.join(char['text'] for char in chars)
            logging.info(f"[{filename}] Page {page_num}: Character extraction found: '{text}'")
            texts.append(text)
    except Exception as e:
        logging.debug(f"[{filename}] Page {page_num}: Character extraction failed: {e}")
    # Method 4: Try extracting annotations
    try:
        annots = page.annots
        if annots and isinstance(annots, list):  # Fix for the error
            for annot in annots:
                if isinstance(annot, dict) and 'contents' in annot:
                    text = annot['contents']
                    if text and not isinstance(text, str):
                        text = str(text)
                    if text and text.lower() != 'none':
                        logging.info(f"[{filename}] Page {page_num}: Annotation found: '{text}'")
                        texts.append(text)
    except Exception as e:
        logging.debug(f"[{filename}] Page {page_num}: Annotation extraction failed: {e}")
    # Method 5: Try extracting text in reverse order
    try:
        chars = sorted(page.crop(crop_box).chars, key=lambda x: (-x['top'], x['x0']))
        if chars:
            text = ''.join(char['text'] for char in chars)
            logging.info(f"[{filename}] Page {page_num}: Reverse order extraction found: '{text}'")
            texts.append(text)
    except Exception as e:
        logging.debug(f"[{filename}] Page {page_num}: Reverse order extraction failed: {e}")
    # Method 6: Last resort - flatten and OCR the crop box
    if not texts:
        try:
-            logging.info(f"[{filename}] Attempting to extract text from full page")
+            logging.info(f"[{filename}] Page {page_num}: Attempting flatten and OCR")
-            text = page.extract_text() or ""
+            # Import needed only if we get this far
-            logging.info(f"[{filename}] Page {page_num}: Full page text: '{text}'")
+            from pdf2image import convert_from_bytes
-            return text
+            import pytesseract
-        except Exception as e2:
+            
-            logging.error(f"[{filename}] Error extracting text from full page: {str(e2)}")
+            # Convert just this page to image
-            return ""
+            with tempfile.NamedTemporaryFile(suffix='.pdf') as tmp_pdf:
                # Save just this page to a temporary PDF
                writer = pdfplumber.PDF(page.page_obj)
                writer.save(tmp_pdf.name)
                # Convert to image
                images = convert_from_bytes(open(tmp_pdf.name, 'rb').read())
                if images:
                    # Crop the image to our area of interest
                    img = images[0]
                    img_width, img_height = img.size
                    crop_box_pixels = (
                        int(x0 * img_width / width),
                        int(y0 * img_height / height),
                        int(x1 * img_width / width),
                        int(y1 * img_height / height)
                    )
                    cropped = img.crop(crop_box_pixels)
                    # OCR the cropped area
                    text = pytesseract.image_to_string(cropped)
                    if text:
                        logging.info(f"[{filename}] Page {page_num}: Flatten/OCR found: '{text}'")
                        texts.append(text)
        except Exception as e:
            logging.debug(f"[{filename}] Page {page_num}: Flatten/OCR failed: {e}")
    return texts
-def extract_text_from_page_old(page, pdf_path, page_num, use_ocr):
+def find_bates_number(texts, pattern):
-    """Extract text from a page, using OCR if enabled and needed."""
+    """Try to find Bates number in multiple text layers."""
-    filename = Path(pdf_path).name
+    for text in texts:
-    # Get page dimensions
+        matches = list(re.finditer(pattern, text))
-    width = page.width
+        if matches:
-    height = page.height
+            return matches[-1]  # Return last match if found
-    
+    return None
    # Calculate crop box as relative position (bottom right corner)
    # Use relative positioning and ensure we stay within bounds
    x0 = min(width * 0.67, width - 10)  # Start at 2/3 of the width, but ensure we stay in bounds
    y0 = min(height * 0.83, height - 10)  # Start at 5/6 of the height, but ensure we stay in bounds
    x1 = width  # Full width
    y1 = height  # Full height
    # Ensure our crop box is within bounds
    x0 = max(0, min(x0, width))
    y0 = max(0, min(y0, height))
    x1 = max(0, min(x1, width))
    y1 = max(0, min(y1, height))
    logging.debug(f"[{filename}] Page {page_num}: dimensions {width}x{height}, crop box: ({x0}, {y0}, {x1}, {y1})")
    try:
        text = page.crop((x0, y0, x1, y1)).extract_text() or ""
        logging.debug(f"[{filename}] Page {page_num}: extracted text: '{text}'")
        if use_ocr and len(text.split()) < 2:
            logging.info(f"[{filename}] Page {page_num}: has less than 2 words, attempting OCR")
            text = ocr_page(pdf_path, page_num)
            logging.debug(f"[{filename}] Page {page_num}: OCR text: '{text}'")
        return text
    except Exception as e:
        logging.error(f"[{filename}] Error extracting text from page {page_num}: {str(e)}")
        return ""
 def extract_bates_numbers(pdf_path, pattern, use_ocr):
    """Extract Bates numbers from first and last page of PDF using provided pattern."""
@@ -232,33 +263,46 @@ def extract_bates_numbers(pdf_path, pattern, use_ocr):
            first_page = pdf.pages[0]
            last_page = pdf.pages[-1]
-            logging.debug(f"[{filename}] PDF has {len(pdf.pages)} pages")
+            # Try all PDF layers first
            first_texts = extract_text_from_page_multilayer(first_page, pdf_path, 0)
            last_texts = extract_text_from_page_multilayer(last_page, pdf_path, len(pdf.pages)-1)
-            first_text = extract_text_from_page(first_page, pdf_path, 0, use_ocr)
+            first_match = find_bates_number(first_texts, pattern)
-            last_text = extract_text_from_page(last_page, pdf_path, len(pdf.pages)-1, use_ocr)
+            last_match = find_bates_number(last_texts, pattern)
-            logging.debug(f"[{filename}] First page text: '{first_text}'")
+            # If no matches found, try flatten and OCR
-            logging.debug(f"[{filename}] Last page text: '{last_text}'")
+            if not first_match or not last_match:
                logging.info(f"[{filename}] No matches in text layers, attempting flatten/OCR")
-            first_matches = list(re.finditer(pattern, first_text))
+                # For first page
-            last_matches = list(re.finditer(pattern, last_text))
+                if not first_match:
                    try:
                        flattened_text = flatten_and_ocr_page(first_page, pdf_path, 0)
                        if flattened_text:
                            first_texts.append(flattened_text)
                            matches = list(re.finditer(pattern, flattened_text))
                            if matches:
                                first_match = matches[-1]
                    except Exception as e:
                        logging.error(f"[{filename}] Flatten/OCR failed for first page: {e}")
-            logging.debug(f"[{filename}] First page matches: {[m.group(0) for m in first_matches]}")
+                # For last page
-            logging.debug(f"[{filename}] Last page matches: {[m.group(0) for m in last_matches]}")
+                if not last_match:
-            
+                    try:
-            first_match = first_matches[-1] if first_matches else None
+                        flattened_text = flatten_and_ocr_page(last_page, pdf_path, len(pdf.pages)-1)
-            last_match = last_matches[-1] if last_matches else None
+                        if flattened_text:
                            last_texts.append(flattened_text)
                            matches = list(re.finditer(pattern, flattened_text))
                            if matches:
                                last_match = matches[-1]
                    except Exception as e:
                        logging.error(f"[{filename}] Flatten/OCR failed for last page: {e}")
            if first_match and last_match:
                # Extract just the numbers from the full match
                first_num = ''.join(filter(str.isdigit, first_match.group(0)))
                last_num = ''.join(filter(str.isdigit, last_match.group(0)))
                logging.info(f"[{filename}] Found numbers: {first_num}–{last_num}")
                if len(first_matches) > 1:
                    logging.debug(f"[{filename}] Multiple matches on first page, using last match. All matches: {[m.group(0) for m in first_matches]}")
                if len(last_matches) > 1:
                    logging.debug(f"[{filename}] Multiple matches on last page, using last match. All matches: {[m.group(0) for m in last_matches]}")
                return (first_num, last_num)
            else:
                logging.warning(f"[{filename}] No matching numbers found")
@@ -267,6 +311,64 @@ def extract_bates_numbers(pdf_path, pattern, use_ocr):
        logging.error(f"[{filename}] Error processing PDF: {str(e)}")
        return None
 def flatten_and_ocr_page(page, pdf_path, page_num):
    """Rasterize a single page and OCR the strip at the bottom of it.

    The page at index ``page_num`` of ``pdf_path`` is copied into a temporary
    single-page PDF, rendered to an image, cropped to the bottom fifth of the
    page (with a 2-point padding) and passed to Tesseract.

    Args:
        page: Page object exposing ``width`` and ``height`` attributes, used
            to scale the crop box from page units to image pixels.
        pdf_path: Path of the source PDF; its file name prefixes log messages.
        page_num: Zero-based index of the page inside ``pdf_path``.

    Returns:
        The OCR text when it is non-empty, otherwise ``None``. ``None`` is
        also returned when any step fails (including a missing OCR
        dependency); the failure is logged rather than raised.
    """
    filename = Path(pdf_path).name
    logging.info(f"[{filename}] Page {page_num}: Attempting flatten and OCR")
    tmp_path = None  # set once the temporary PDF exists, so cleanup knows what to remove
    try:
        # Import needed only if we get this far
        from pdf2image import convert_from_path
        import pytesseract
        import PyPDF2

        # Get page dimensions
        width = page.width
        height = page.height

        # Calculate crop box for bottom fifth
        padding = 2
        y0 = max(0, min(height * 0.8, height - padding))
        y1 = max(y0 + padding, height)
        x0 = padding
        x1 = max(x0 + padding, width - padding)

        # Create a single-page PDF with just this page. The source file must
        # stay open until the writer has finished writing.
        pdf_writer = PyPDF2.PdfWriter()
        with open(pdf_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            pdf_writer.add_page(pdf_reader.pages[page_num])
            with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp_pdf:
                tmp_path = tmp_pdf.name
                pdf_writer.write(tmp_pdf)

        # Convert to image only after the temporary file is closed, so the
        # converter can reopen it by name on every platform.
        images = convert_from_path(tmp_path)
        if not images:
            return None

        # Crop the image to our area of interest
        img = images[0]
        img_width, img_height = img.size
        crop_box_pixels = (
            int(x0 * img_width / width),
            int(y0 * img_height / height),
            int(x1 * img_width / width),
            int(y1 * img_height / height)
        )
        cropped = img.crop(crop_box_pixels)

        # OCR the cropped area
        text = pytesseract.image_to_string(cropped)
        if text:
            logging.info(f"[{filename}] Page {page_num}: Flatten/OCR found: '{text}'")
            return text
        return None
    except Exception as e:
        logging.error(f"[{filename}] Page {page_num}: Flatten/OCR failed: {e}")
        return None
    finally:
        # Always remove the temporary PDF: it is created with delete=False,
        # so nothing else cleans it up on the success or failure paths.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError as cleanup_error:
                logging.debug(f"[{filename}] Page {page_num}: Could not remove temporary file: {cleanup_error}")
 def process_folder(folder_path, pattern, use_ocr, dry_run=False, name_prefix=None):
    """Process all PDFs in the specified folder."""
    folder = Path(folder_path)
@@ -280,7 +382,10 @@ def process_folder(folder_path, pattern, use_ocr, dry_run=False, name_prefix=Non
    success_count = 0
    rename_count = 0
-    for pdf_file in folder.glob('*.pdf'):
+    # Use simple case-insensitive matching
    pdf_files = [f for f in folder.iterdir() if f.is_file() and f.suffix.lower() == '.pdf']
    for pdf_file in pdf_files:
        pdf_count += 1
        numbers = extract_bates_numbers(pdf_file, pattern, use_ocr)
        if numbers: