Auto-update: Sat Oct 26 13:45:39 PDT 2024

2024-10-26 13:45:39 -07:00 · 2024-10-26 13:45:39 -07:00 · 99d919cb27
commit 99d919cb27
parent e4dc0ab99a
1 changed files with 202 additions and 97 deletions
--- a/299
+++ b/299
@ -133,95 +133,126 @@ def ocr_page(pdf_path, page_num):
        logging.error(f"[{filename}] OCR failed for page {page_num}: {str(e)}")
        return ""

-def extract_text_from_page(page, pdf_path, page_num, use_ocr):
-    """Extract text from a page, using OCR if enabled and needed."""
+def extract_text_from_page_multilayer(page, pdf_path, page_num):
+    """Extract text from different PDF layers."""
    filename = Path(pdf_path).name
    # Get page dimensions
    width = page.width
    height = page.height

    # Calculate crop box for bottom fifth of page
-    padding = 2  # 2 point padding
-    
-    # Start at 80% down the page (leaving bottom fifth)
+    padding = 2
    y0 = max(0, min(height * 0.8, height - padding))
    y1 = max(y0 + padding, min(height, height))
-    
-    # Use full width
    x0 = padding
    x1 = max(x0 + padding, min(width - padding, width))

-    # Ensure the crop box makes sense
-    if x1 <= x0 or y1 <= y0:
-        logging.warning(f"[{filename}] Page {page_num}: Invalid crop box dimensions, using full page")
-        x0, y0 = 0, 0
-        x1, y1 = width, height
+    crop_box = (x0, y0, x1, y1)

-    logging.info(f"[{filename}] Page {page_num}: Page size: {width}x{height} points")
-    logging.info(f"[{filename}] Page {page_num}: Crop box: ({x0:.2f}, {y0:.2f}, {x1:.2f}, {y1:.2f})")
-    
+    logging.info(f"[{filename}] Page {page_num}: Dimensions {width}x{height}, crop box: ({x0:.2f}, {y0:.2f}, {x1:.2f}, {y1:.2f})")
+
+    texts = []
+
+    # Method 1: Try regular text extraction
    try:
-        # Extract text from the crop box
-        cropped_text = page.crop((x0, y0, x1, y1)).extract_text() or ""
-        logging.info(f"[{filename}] Page {page_num}: Cropped text: '{cropped_text}'")
-        
-        # If we don't find anything in the crop, try the full page
-        if not cropped_text.strip():
-            logging.info(f"[{filename}] Page {page_num}: No text in crop box, trying full page")
-            full_text = page.extract_text() or ""
-            logging.info(f"[{filename}] Page {page_num}: Full page text: '{full_text}'")
-            return full_text
-        
-        return cropped_text
-        
+        text = page.crop(crop_box).extract_text()
+        if text:
+            logging.info(f"[{filename}] Page {page_num}: Regular extraction found: '{text}'")
+            texts.append(text)
    except Exception as e:
-        logging.error(f"[{filename}] Error extracting text from page {page_num}: {str(e)}")
-        # If crop fails, try extracting text from the entire page
+        logging.debug(f"[{filename}] Page {page_num}: Regular text extraction failed: {e}")
+
+    # Method 2: Try extracting words individually
+    try:
+        words = page.crop(crop_box).extract_words()
+        if words:
+            text = ' '.join(word['text'] for word in words)
+            logging.info(f"[{filename}] Page {page_num}: Word extraction found: '{text}'")
+            texts.append(text)
+    except Exception as e:
+        logging.debug(f"[{filename}] Page {page_num}: Word extraction failed: {e}")
+
+    # Method 3: Try extracting characters individually
+    try:
+        chars = page.crop(crop_box).chars
+        if chars:
+            text = ''.join(char['text'] for char in chars)
+            logging.info(f"[{filename}] Page {page_num}: Character extraction found: '{text}'")
+            texts.append(text)
+    except Exception as e:
+        logging.debug(f"[{filename}] Page {page_num}: Character extraction failed: {e}")
+
+    # Method 4: Try extracting annotations
+    try:
+        annots = page.annots
+        if annots and isinstance(annots, list):  # Fix for the error
+            for annot in annots:
+                if isinstance(annot, dict) and 'contents' in annot:
+                    text = annot['contents']
+                    if text and not isinstance(text, str):
+                        text = str(text)
+                    if text and text.lower() != 'none':
+                        logging.info(f"[{filename}] Page {page_num}: Annotation found: '{text}'")
+                        texts.append(text)
+    except Exception as e:
+        logging.debug(f"[{filename}] Page {page_num}: Annotation extraction failed: {e}")
+
+    # Method 5: Try extracting text in reverse order
+    try:
+        chars = sorted(page.crop(crop_box).chars, key=lambda x: (-x['top'], x['x0']))
+        if chars:
+            text = ''.join(char['text'] for char in chars)
+            logging.info(f"[{filename}] Page {page_num}: Reverse order extraction found: '{text}'")
+            texts.append(text)
+    except Exception as e:
+        logging.debug(f"[{filename}] Page {page_num}: Reverse order extraction failed: {e}")
+
+    # Method 6: Last resort - flatten and OCR the crop box
+    if not texts:
        try:
-            logging.info(f"[{filename}] Attempting to extract text from full page")
-            text = page.extract_text() or ""
-            logging.info(f"[{filename}] Page {page_num}: Full page text: '{text}'")
-            return text
-        except Exception as e2:
-            logging.error(f"[{filename}] Error extracting text from full page: {str(e2)}")
-            return ""
+            logging.info(f"[{filename}] Page {page_num}: Attempting flatten and OCR")
+            # Import needed only if we get this far
+            from pdf2image import convert_from_bytes
+            import pytesseract
+            
+            # Convert just this page to image
+            with tempfile.NamedTemporaryFile(suffix='.pdf') as tmp_pdf:
+                # Save just this page to a temporary PDF
+                writer = pdfplumber.PDF(page.page_obj)
+                writer.save(tmp_pdf.name)
+                
+                # Convert to image
+                images = convert_from_bytes(open(tmp_pdf.name, 'rb').read())
+                if images:
+                    # Crop the image to our area of interest
+                    img = images[0]
+                    img_width, img_height = img.size
+                    crop_box_pixels = (
+                        int(x0 * img_width / width),
+                        int(y0 * img_height / height),
+                        int(x1 * img_width / width),
+                        int(y1 * img_height / height)
+                    )
+                    cropped = img.crop(crop_box_pixels)
+                    
+                    # OCR the cropped area
+                    text = pytesseract.image_to_string(cropped)
+                    if text:
+                        logging.info(f"[{filename}] Page {page_num}: Flatten/OCR found: '{text}'")
+                        texts.append(text)
+        except Exception as e:
+            logging.debug(f"[{filename}] Page {page_num}: Flatten/OCR failed: {e}")
+
+    return texts


-def extract_text_from_page_old(page, pdf_path, page_num, use_ocr):
-    """Extract text from a page, using OCR if enabled and needed."""
-    filename = Path(pdf_path).name
-    # Get page dimensions
-    width = page.width
-    height = page.height
-    
-    # Calculate crop box as relative position (bottom right corner)
-    # Use relative positioning and ensure we stay within bounds
-    x0 = min(width * 0.67, width - 10)  # Start at 2/3 of the width, but ensure we stay in bounds
-    y0 = min(height * 0.83, height - 10)  # Start at 5/6 of the height, but ensure we stay in bounds
-    x1 = width  # Full width
-    y1 = height  # Full height
-    
-    # Ensure our crop box is within bounds
-    x0 = max(0, min(x0, width))
-    y0 = max(0, min(y0, height))
-    x1 = max(0, min(x1, width))
-    y1 = max(0, min(y1, height))
-    
-    logging.debug(f"[{filename}] Page {page_num}: dimensions {width}x{height}, crop box: ({x0}, {y0}, {x1}, {y1})")
-    
-    try:
-        text = page.crop((x0, y0, x1, y1)).extract_text() or ""
-        logging.debug(f"[{filename}] Page {page_num}: extracted text: '{text}'")
-        
-        if use_ocr and len(text.split()) < 2:
-            logging.info(f"[{filename}] Page {page_num}: has less than 2 words, attempting OCR")
-            text = ocr_page(pdf_path, page_num)
-            logging.debug(f"[{filename}] Page {page_num}: OCR text: '{text}'")
-        
-        return text
-    except Exception as e:
-        logging.error(f"[{filename}] Error extracting text from page {page_num}: {str(e)}")
-        return ""
+def find_bates_number(texts, pattern):
+    """Try to find Bates number in multiple text layers."""
+    for text in texts:
+        matches = list(re.finditer(pattern, text))
+        if matches:
+            return matches[-1]  # Return last match if found
+    return None

 def extract_bates_numbers(pdf_path, pattern, use_ocr):
    """Extract Bates numbers from first and last page of PDF using provided pattern."""
@ -231,34 +262,47 @@ def extract_bates_numbers(pdf_path, pattern, use_ocr):
        with pdfplumber.open(pdf_path) as pdf:
            first_page = pdf.pages[0]
            last_page = pdf.pages[-1]
-            
-            logging.debug(f"[{filename}] PDF has {len(pdf.pages)} pages")
-            
-            first_text = extract_text_from_page(first_page, pdf_path, 0, use_ocr)
-            last_text = extract_text_from_page(last_page, pdf_path, len(pdf.pages)-1, use_ocr)
-            
-            logging.debug(f"[{filename}] First page text: '{first_text}'")
-            logging.debug(f"[{filename}] Last page text: '{last_text}'")
-            
-            first_matches = list(re.finditer(pattern, first_text))
-            last_matches = list(re.finditer(pattern, last_text))
-            
-            logging.debug(f"[{filename}] First page matches: {[m.group(0) for m in first_matches]}")
-            logging.debug(f"[{filename}] Last page matches: {[m.group(0) for m in last_matches]}")
-            
-            first_match = first_matches[-1] if first_matches else None
-            last_match = last_matches[-1] if last_matches else None
-            
+
+            # Try all PDF layers first
+            first_texts = extract_text_from_page_multilayer(first_page, pdf_path, 0)
+            last_texts = extract_text_from_page_multilayer(last_page, pdf_path, len(pdf.pages)-1)
+
+            first_match = find_bates_number(first_texts, pattern)
+            last_match = find_bates_number(last_texts, pattern)
+
+            # If no matches found, try flatten and OCR
+            if not first_match or not last_match:
+                logging.info(f"[{filename}] No matches in text layers, attempting flatten/OCR")
+                
+                # For first page
+                if not first_match:
+                    try:
+                        flattened_text = flatten_and_ocr_page(first_page, pdf_path, 0)
+                        if flattened_text:
+                            first_texts.append(flattened_text)
+                            matches = list(re.finditer(pattern, flattened_text))
+                            if matches:
+                                first_match = matches[-1]
+                    except Exception as e:
+                        logging.error(f"[{filename}] Flatten/OCR failed for first page: {e}")
+
+                # For last page
+                if not last_match:
+                    try:
+                        flattened_text = flatten_and_ocr_page(last_page, pdf_path, len(pdf.pages)-1)
+                        if flattened_text:
+                            last_texts.append(flattened_text)
+                            matches = list(re.finditer(pattern, flattened_text))
+                            if matches:
+                                last_match = matches[-1]
+                    except Exception as e:
+                        logging.error(f"[{filename}] Flatten/OCR failed for last page: {e}")
+
            if first_match and last_match:
-                # Extract just the numbers from the full match
                first_num = ''.join(filter(str.isdigit, first_match.group(0)))
                last_num = ''.join(filter(str.isdigit, last_match.group(0)))
-                
+
                logging.info(f"[{filename}] Found numbers: {first_num}–{last_num}")
-                if len(first_matches) > 1:
-                    logging.debug(f"[{filename}] Multiple matches on first page, using last match. All matches: {[m.group(0) for m in first_matches]}")
-                if len(last_matches) > 1:
-                    logging.debug(f"[{filename}] Multiple matches on last page, using last match. All matches: {[m.group(0) for m in last_matches]}")
                return (first_num, last_num)
            else:
                logging.warning(f"[{filename}] No matching numbers found")
@ -267,6 +311,64 @@ def extract_bates_numbers(pdf_path, pattern, use_ocr):
        logging.error(f"[{filename}] Error processing PDF: {str(e)}")
        return None

+def flatten_and_ocr_page(page, pdf_path, page_num):
+    """Flatten page and OCR the crop box area."""
+    filename = Path(pdf_path).name
+    logging.info(f"[{filename}] Page {page_num}: Attempting flatten and OCR")
+    
+    try:
+        # Import needed only if we get this far
+        from pdf2image import convert_from_path
+        import pytesseract
+        import PyPDF2
+        
+        # Get page dimensions
+        width = page.width
+        height = page.height
+        
+        # Calculate crop box for bottom fifth
+        padding = 2
+        y0 = max(0, min(height * 0.8, height - padding))
+        y1 = max(y0 + padding, min(height, height))
+        x0 = padding
+        x1 = max(x0 + padding, min(width - padding, width))
+        
+        # Create a single-page PDF with just this page
+        with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp_pdf:
+            pdf_writer = PyPDF2.PdfWriter()
+            with open(pdf_path, 'rb') as pdf_file:
+                pdf_reader = PyPDF2.PdfReader(pdf_file)
+                pdf_writer.add_page(pdf_reader.pages[page_num])
+                pdf_writer.write(tmp_pdf)
+                tmp_pdf.flush()
+            
+            # Convert to image
+            images = convert_from_path(tmp_pdf.name)
+            if images:
+                # Crop the image to our area of interest
+                img = images[0]
+                img_width, img_height = img.size
+                crop_box_pixels = (
+                    int(x0 * img_width / width),
+                    int(y0 * img_height / height),
+                    int(x1 * img_width / width),
+                    int(y1 * img_height / height)
+                )
+                cropped = img.crop(crop_box_pixels)
+                
+                # OCR the cropped area
+                text = pytesseract.image_to_string(cropped)
+                if text:
+                    logging.info(f"[{filename}] Page {page_num}: Flatten/OCR found: '{text}'")
+                    return text
+        
+        # Clean up the temporary file
+        os.unlink(tmp_pdf.name)
+        
+    except Exception as e:
+        logging.error(f"[{filename}] Page {page_num}: Flatten/OCR failed: {e}")
+        return None
+
 def process_folder(folder_path, pattern, use_ocr, dry_run=False, name_prefix=None):
    """Process all PDFs in the specified folder."""
    folder = Path(folder_path)
@ -280,7 +382,10 @@ def process_folder(folder_path, pattern, use_ocr, dry_run=False, name_prefix=Non
    success_count = 0
    rename_count = 0
    
-    for pdf_file in folder.glob('*.pdf'):
+    # Use simple case-insensitive matching
+    pdf_files = [f for f in folder.iterdir() if f.is_file() and f.suffix.lower() == '.pdf']
+    
+    for pdf_file in pdf_files:
        pdf_count += 1
        numbers = extract_bates_numbers(pdf_file, pattern, use_ocr)
        if numbers: