2024-10-26 20:37:01 +02:00
|
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
Required packages:
|
|
|
|
|
pip3 install pdfplumber pytesseract pdf2image PyPDF2 # pdf2image, pytesseract and PyPDF2 only needed if using --ocr
|
|
|
|
|
|
|
|
|
|
System dependencies (only if using --ocr):
|
|
|
|
|
brew install tesseract poppler # on macOS
|
|
|
|
|
# or
|
|
|
|
|
sudo apt-get install tesseract-ocr poppler-utils # on Ubuntu/Debian
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
import os
|
|
|
|
|
import sys
|
|
|
|
|
import re
|
|
|
|
|
import argparse
|
|
|
|
|
import logging
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
import tempfile
|
|
|
|
|
import subprocess
|
|
|
|
|
import pdfplumber
|
|
|
|
|
|
|
|
|
|
def check_dependencies(ocr_enabled):
    """Verify that the optional OCR packages can be imported.

    Nothing is checked when OCR is disabled: the only unconditional
    third-party dependency (pdfplumber) is imported at module load time.

    Args:
        ocr_enabled: True when the user asked for OCR support.

    Side effects:
        Prints every missing package plus a matching ``pip3 install`` hint
        and terminates the process with exit status 1 if any is missing.
    """
    if not ocr_enabled:
        return

    # Imported locally so this function does not rely on a new file-level import.
    import importlib

    # PyPDF2 is included because flatten_and_ocr_page() imports it next to
    # pytesseract and pdf2image; leaving it out let the check pass and the
    # OCR step fail later at run time.
    missing = []
    for module_name in ("pytesseract", "pdf2image", "PyPDF2"):
        try:
            importlib.import_module(module_name)
        except ImportError as e:
            print(f"Missing dependency: {e}")
            missing.append(module_name)

    if missing:
        print("Please install required packages:")
        print(f"pip3 install {' '.join(missing)}")
        sys.exit(1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
|
|
|
|
|
import sys
|
|
|
|
|
import re
|
|
|
|
|
import argparse
|
|
|
|
|
import logging
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
import tempfile
|
|
|
|
|
import subprocess
|
|
|
|
|
|
|
|
|
|
def setup_logging(log_level):
    """Initialise root logging at the verbosity named by *log_level*.

    Args:
        log_level: Name of a ``logging`` level (case-insensitive), e.g.
            ``"info"`` or ``"DEBUG"``.

    Raises:
        ValueError: If the upper-cased name is not an integer attribute of
            the ``logging`` module.
    """
    resolved_level = getattr(logging, log_level.upper(), None)

    # Happy path first: only integer attributes are usable as levels.
    if isinstance(resolved_level, int):
        logging.basicConfig(
            datefmt='%Y-%m-%d %H:%M:%S',
            format='%(asctime)s - %(levelname)s - %(message)s',
            level=resolved_level,
        )
        return

    raise ValueError(f'Invalid log level: {log_level}')
|
|
|
|
|
|
|
|
|
|
def build_regex_pattern(prefix, num_digits):
    """Compose the regular expression used to recognise a Bates number.

    The result matches *prefix* taken literally, then exactly *num_digits*
    digits, and refuses a match that is immediately followed by another
    digit or word character.

    Args:
        prefix: Literal text that precedes the digits (regex metacharacters
            are escaped).
        num_digits: Number of digits expected after the prefix.

    Returns:
        The pattern as a string, ready for the ``re`` module.
    """
    literal_prefix = re.escape(prefix)
    digit_run = f"\\d{{{num_digits}}}"
    not_followed_by_word_char = "(?![\\d\\w])"

    regex = literal_prefix + digit_run + not_followed_by_word_char
    logging.debug(f"Generated regex pattern: {regex}")
    return regex
|
|
|
|
|
|
|
|
|
|
def _applescript_quote(value):
    """Return *value* as a double-quoted AppleScript string literal."""
    # Backslash must be escaped first so the backslashes added for quotes
    # are not doubled again.
    escaped = str(value).replace('\\', '\\\\').replace('"', '\\"')
    return f'"{escaped}"'


def set_finder_comment(file_path, comment):
    """Set the Finder comment for a file using osascript.

    The AppleScript is handed to ``osascript`` as a single argument (no
    shell involved), so characters in the path or comment cannot be
    interpreted by a shell; they only need AppleScript string escaping.

    Args:
        file_path: Path of the file whose comment should be set.
        comment: Text to store as the Finder comment.

    Returns:
        True if osascript succeeded, False on any failure (the failure is
        logged, never raised).
    """
    try:
        # Use an absolute path so the result does not depend on how
        # AppleScript resolves a relative POSIX path.
        absolute_path = os.path.abspath(str(file_path))

        script = "\n".join([
            'tell application "Finder"',
            f'set commentPath to POSIX file {_applescript_quote(absolute_path)} as alias',
            f'set comment of commentPath to {_applescript_quote(comment)}',
            'end tell',
        ])
        subprocess.run(["osascript", "-e", script], check=True, stderr=subprocess.PIPE)
        logging.debug(f"Set Finder comment for {file_path} to: {comment}")
        return True
    except subprocess.CalledProcessError as e:
        # stderr is raw bytes from osascript; never let decoding raise here.
        stderr_text = e.stderr.decode(errors="replace") if e.stderr else ""
        logging.error(f"Failed to set Finder comment for {file_path}: {stderr_text}")
        return False
    except Exception as e:
        # Covers e.g. osascript not being installed (non-macOS systems).
        logging.error(f"Failed to set Finder comment for {file_path}: {e}")
        return False
|
|
|
|
|
|
|
|
|
|
def rename_with_bates(file_path, name_prefix, first_num, last_num):
    """Rename file using Bates numbers and preserve original name in metadata.

    The new name is ``<name_prefix><first_num>–<last_num><suffix>`` in the
    same directory. The original file name is stored as the Finder comment
    before the rename; if that fails, the file is left untouched.

    Args:
        file_path: Path of the file to rename.
        name_prefix: Text placed in front of the number range.
        first_num: Bates number found on the first page.
        last_num: Bates number found on the last page.

    Returns:
        True if the file was renamed, False if it was skipped or an error
        occurred (errors are logged, never raised).
    """
    try:
        path = Path(file_path)
        original_name = path.name
        new_name = f"{name_prefix}{first_num}–{last_num}{path.suffix}"
        new_path = path.parent / new_name

        # Already carries its Bates name: do nothing, so the Finder comment
        # holding the real original name is not overwritten.
        if new_name == original_name:
            logging.info(f"{original_name} already has its Bates name, skipping")
            return False

        # Path.rename() silently replaces an existing target on POSIX, which
        # would destroy another document. samefile() lets a case-only rename
        # through on case-insensitive file systems.
        if new_path.exists() and not new_path.samefile(path):
            logging.error(f"Skipping rename of {file_path}: {new_name} already exists")
            return False

        # First try to set the metadata
        if not set_finder_comment(file_path, original_name):
            logging.error(f"Skipping rename of {file_path} due to metadata failure")
            return False

        # Then rename the file
        path.rename(new_path)
        logging.info(f"Renamed {original_name} to {new_name}")
        return True
    except Exception as e:
        logging.error(f"Failed to rename {file_path}: {e}")
        return False
|
|
|
|
|
|
|
|
|
|
def ocr_page(pdf_path, page_num):
    """OCR a specific page of a PDF.

    Args:
        pdf_path: Path of the PDF file.
        page_num: Zero-based page index (converted to the one-based page
            numbers that pdf2image expects).

    Returns:
        The recognised text, or an empty string if the page could not be
        rendered or OCR failed (failures are logged, never raised).
    """
    filename = Path(pdf_path).name
    logging.debug(f"[{filename}] Running OCR on page {page_num}")
    try:
        # Import OCR-related modules only when needed
        import pytesseract
        from pdf2image import convert_from_path

        # Convert specific page to image
        images = convert_from_path(pdf_path, first_page=page_num + 1, last_page=page_num + 1)
        if not images:
            logging.error(f"[{filename}] Failed to convert page {page_num} to image")
            return ""

        # OCR the rendered image directly. The previous PNG round-trip
        # through a NamedTemporaryFile reopened the file by name while it
        # was still open (not portable) and added nothing: the other OCR
        # paths in this file already pass PIL images to pytesseract.
        text = pytesseract.image_to_string(images[0])
        logging.debug(f"[{filename}] Page {page_num} OCR result: '{text}'")
        return text
    except Exception as e:
        logging.error(f"[{filename}] OCR failed for page {page_num}: {str(e)}")
        return ""
|
|
|
|
|
|
2024-10-26 22:45:39 +02:00
|
|
|
|
def extract_text_from_page_multilayer(page, pdf_path, page_num):
    """Extract text from different PDF layers.

    Looks at the bottom fifth of the page (inset by a small padding) and
    collects candidate strings from several extraction strategies, in this
    order: regular text, individual words, individual characters, page
    annotations (whole page), and characters sorted bottom-to-top.

    No OCR is attempted here. The former "flatten and OCR" step could never
    succeed (it tried to build a ``pdfplumber.PDF`` from a page object and
    call a ``save`` method on it), so it only produced a debug message;
    ``extract_bates_numbers`` performs that fallback through
    ``flatten_and_ocr_page``.

    Args:
        page: Page object providing ``width``, ``height``, ``crop()`` and
            ``annots`` (as used below; presumably a pdfplumber page).
        pdf_path: Path of the PDF, used only for log messages.
        page_num: Page index, used only for log messages.

    Returns:
        List of extracted strings (possibly empty), in strategy order.
    """
    filename = Path(pdf_path).name

    # Get page dimensions
    width = page.width
    height = page.height

    # Calculate crop box for bottom fifth of page
    padding = 2
    y0 = max(0, min(height * 0.8, height - padding))
    y1 = max(y0 + padding, height)
    x0 = padding
    x1 = max(x0 + padding, width - padding)
    crop_box = (x0, y0, x1, y1)

    logging.info(f"[{filename}] Page {page_num}: Dimensions {width}x{height}, crop box: ({x0:.2f}, {y0:.2f}, {x1:.2f}, {y1:.2f})")

    texts = []

    # Crop once and share the region between the strategies below instead
    # of re-cropping the page for every one of them.
    try:
        region = page.crop(crop_box)
    except Exception as e:
        logging.debug(f"[{filename}] Page {page_num}: Cropping failed: {e}")
        region = None

    if region is not None:
        # Method 1: Try regular text extraction
        try:
            text = region.extract_text()
            if text:
                logging.info(f"[{filename}] Page {page_num}: Regular extraction found: '{text}'")
                texts.append(text)
        except Exception as e:
            logging.debug(f"[{filename}] Page {page_num}: Regular text extraction failed: {e}")

        # Method 2: Try extracting words individually
        try:
            words = region.extract_words()
            if words:
                text = ' '.join(word['text'] for word in words)
                logging.info(f"[{filename}] Page {page_num}: Word extraction found: '{text}'")
                texts.append(text)
        except Exception as e:
            logging.debug(f"[{filename}] Page {page_num}: Word extraction failed: {e}")

        # Method 3: Try extracting characters individually
        try:
            chars = region.chars
            if chars:
                text = ''.join(char['text'] for char in chars)
                logging.info(f"[{filename}] Page {page_num}: Character extraction found: '{text}'")
                texts.append(text)
        except Exception as e:
            logging.debug(f"[{filename}] Page {page_num}: Character extraction failed: {e}")

    # Method 4: Try extracting annotations (taken from the whole page)
    try:
        annots = page.annots
        if annots and isinstance(annots, list):
            for annot in annots:
                if isinstance(annot, dict) and 'contents' in annot:
                    text = annot['contents']
                    if text and not isinstance(text, str):
                        text = str(text)
                    # Skip empty contents and the literal string "None".
                    if text and text.lower() != 'none':
                        logging.info(f"[{filename}] Page {page_num}: Annotation found: '{text}'")
                        texts.append(text)
    except Exception as e:
        logging.debug(f"[{filename}] Page {page_num}: Annotation extraction failed: {e}")

    # Method 5: Try extracting text in reverse order (bottom line first,
    # left to right within a line)
    if region is not None:
        try:
            chars = sorted(region.chars, key=lambda x: (-x['top'], x['x0']))
            if chars:
                text = ''.join(char['text'] for char in chars)
                logging.info(f"[{filename}] Page {page_num}: Reverse order extraction found: '{text}'")
                texts.append(text)
        except Exception as e:
            logging.debug(f"[{filename}] Page {page_num}: Reverse order extraction failed: {e}")

    return texts
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def find_bates_number(texts, pattern):
    """Search the extracted text layers for a Bates number.

    Layers are examined in the order given; the first layer containing at
    least one match wins, and within that layer the last match is returned.

    Args:
        texts: Iterable of strings, one per extraction strategy.
        pattern: Regular expression describing the Bates number.

    Returns:
        The ``re.Match`` for the chosen occurrence, or None if no layer
        contains a match.
    """
    for layer in texts:
        final_hit = None
        # Walk the iterator to its end, keeping only the last match seen.
        for final_hit in re.finditer(pattern, layer):
            pass
        if final_hit is not None:
            return final_hit
    return None
|
2024-10-26 20:37:01 +02:00
|
|
|
|
|
|
|
|
|
def _find_bates_on_page(page, pdf_path, page_index, pattern, use_ocr, label):
    """Return the last Bates match on one page (text layers, then optional OCR), or None."""
    filename = Path(pdf_path).name

    # Try all PDF layers first
    texts = extract_text_from_page_multilayer(page, pdf_path, page_index)
    match = find_bates_number(texts, pattern)
    if match:
        return match

    if not use_ocr:
        logging.debug(f"[{filename}] No match in text layers of {label} page; OCR fallback not enabled")
        return None

    logging.info(f"[{filename}] No matches in text layers, attempting flatten/OCR")
    try:
        flattened_text = flatten_and_ocr_page(page, pdf_path, page_index)
    except Exception as e:
        logging.error(f"[{filename}] Flatten/OCR failed for {label} page: {e}")
        return None

    if not flattened_text:
        return None
    return find_bates_number([flattened_text], pattern)


def extract_bates_numbers(pdf_path, pattern, use_ocr):
    """Extract Bates numbers from first and last page of PDF using provided pattern.

    Each page is searched through its text layers first. The flatten/OCR
    fallback is used only when *use_ocr* is true; previously the flag was
    accepted but ignored, so OCR was attempted even when the user had not
    asked for it and its dependencies had not been checked.

    Args:
        pdf_path: Path of the PDF file.
        pattern: Regular expression describing the Bates number.
        use_ocr: Whether the OCR fallback may be used for pages whose text
            layers contain no match.

    Returns:
        ``(first_num, last_num)`` as digit strings when both pages yield a
        match, otherwise None (problems are logged, never raised).
    """
    filename = Path(pdf_path).name
    logging.info(f"[{filename}] Processing PDF")
    try:
        with pdfplumber.open(pdf_path) as pdf:
            if not pdf.pages:
                logging.warning(f"[{filename}] PDF has no pages")
                return None

            last_index = len(pdf.pages) - 1
            first_match = _find_bates_on_page(
                pdf.pages[0], pdf_path, 0, pattern, use_ocr, "first")

            if last_index == 0:
                # Single-page document: first and last page are the same,
                # so do not extract (and possibly OCR) it a second time.
                last_match = first_match
            else:
                last_match = _find_bates_on_page(
                    pdf.pages[last_index], pdf_path, last_index, pattern, use_ocr, "last")

            if first_match and last_match:
                # Keep only the digits of the matched text (prefix dropped).
                first_num = ''.join(filter(str.isdigit, first_match.group(0)))
                last_num = ''.join(filter(str.isdigit, last_match.group(0)))

                logging.info(f"[{filename}] Found numbers: {first_num}–{last_num}")
                return (first_num, last_num)
            else:
                logging.warning(f"[{filename}] No matching numbers found")
                return None
    except Exception as e:
        logging.error(f"[{filename}] Error processing PDF: {str(e)}")
        return None
|
|
|
|
|
|
2024-10-26 22:45:39 +02:00
|
|
|
|
def flatten_and_ocr_page(page, pdf_path, page_num):
    """Flatten page and OCR the crop box area.

    The requested page is copied into a temporary single-page PDF, rendered
    to an image, cropped to the bottom fifth of the page (inset by a small
    padding) and passed to Tesseract.

    Args:
        page: Page object providing ``width`` and ``height``, used to scale
            the crop box to image pixels.
        pdf_path: Path of the source PDF.
        page_num: Zero-based index of the page inside *pdf_path*.

    Returns:
        The recognised text, or None if nothing was recognised or any step
        failed (failures are logged, never raised).
    """
    filename = Path(pdf_path).name
    logging.info(f"[{filename}] Page {page_num}: Attempting flatten and OCR")

    tmp_path = None
    try:
        # Import needed only if we get this far
        from pdf2image import convert_from_path
        import pytesseract
        import PyPDF2

        # Get page dimensions
        width = page.width
        height = page.height

        # Calculate crop box for bottom fifth
        padding = 2
        y0 = max(0, min(height * 0.8, height - padding))
        y1 = max(y0 + padding, height)
        x0 = padding
        x1 = max(x0 + padding, width - padding)

        # Create a single-page PDF with just this page
        with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp_pdf:
            tmp_path = tmp_pdf.name
            pdf_writer = PyPDF2.PdfWriter()
            with open(pdf_path, 'rb') as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                pdf_writer.add_page(pdf_reader.pages[page_num])
                # Write while the source file is still open.
                pdf_writer.write(tmp_pdf)

        # Convert to image (the temporary PDF is closed, hence fully written)
        images = convert_from_path(tmp_path)
        if not images:
            return None

        # Crop the image to our area of interest, scaling PDF units to pixels
        img = images[0]
        img_width, img_height = img.size
        crop_box_pixels = (
            int(x0 * img_width / width),
            int(y0 * img_height / height),
            int(x1 * img_width / width),
            int(y1 * img_height / height),
        )
        cropped = img.crop(crop_box_pixels)

        # OCR the cropped area
        text = pytesseract.image_to_string(cropped)
        if text:
            logging.info(f"[{filename}] Page {page_num}: Flatten/OCR found: '{text}'")
            return text
        return None
    except Exception as e:
        logging.error(f"[{filename}] Page {page_num}: Flatten/OCR failed: {e}")
        return None
    finally:
        # Clean up the temporary file on every path. Previously the cleanup
        # came after the successful return and was skipped on errors, so
        # the file was leaked in both cases.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError as e:
                logging.debug(f"[{filename}] Could not remove temporary file {tmp_path}: {e}")
|
|
|
|
|
|
2024-10-26 20:37:01 +02:00
|
|
|
|
def process_folder(folder_path, pattern, use_ocr, dry_run=False, name_prefix=None):
    """Process all PDFs in the specified folder.

    Only regular files directly inside the folder whose suffix is ``.pdf``
    (case-insensitive) are considered; sub-folders are not searched.

    Args:
        folder_path: Folder to scan.
        pattern: Regular expression describing the Bates number.
        use_ocr: Passed through to ``extract_bates_numbers``.
        dry_run: If true, print ``<file>: <first>–<last>`` for each hit and
            rename nothing.
        name_prefix: Prefix for the new file names; files are renamed only
            when this is not None and *dry_run* is false.

    Returns:
        None. Progress and totals are reported through ``logging``.
    """
    folder = Path(folder_path)
    if not folder.exists():
        logging.error(f"Folder does not exist: {folder_path}")
        return
    # A path to a regular file would otherwise make iterdir() raise
    # NotADirectoryError and abort the program with a traceback.
    if not folder.is_dir():
        logging.error(f"Not a folder: {folder_path}")
        return

    logging.info(f"Processing folder: {folder_path}")

    # Use simple case-insensitive matching; sorted so files are processed
    # (and reported) in a stable order instead of directory order.
    pdf_files = sorted(
        f for f in folder.iterdir()
        if f.is_file() and f.suffix.lower() == '.pdf'
    )

    pdf_count = len(pdf_files)
    success_count = 0
    rename_count = 0

    for pdf_file in pdf_files:
        numbers = extract_bates_numbers(pdf_file, pattern, use_ocr)
        if not numbers:
            continue

        success_count += 1
        if dry_run:
            print(f"{pdf_file.name}: {numbers[0]}–{numbers[1]}")
        elif name_prefix is not None:
            if rename_with_bates(pdf_file, name_prefix, numbers[0], numbers[1]):
                rename_count += 1

    logging.info(f"Processed {pdf_count} PDFs, successfully extracted {success_count} number pairs")
    if not dry_run and name_prefix is not None:
        logging.info(f"Renamed {rename_count} files")
|
|
|
|
|
|
|
|
|
|
def main():
    """Command-line entry point: parse arguments and process the folder.

    Exits with status 2 (argparse usage error) if ``--digits`` is not a
    positive integer, and with status 1 if ``--name-prefix`` is missing
    while not in dry-run mode or a required OCR package is unavailable.
    """
    parser = argparse.ArgumentParser(description='Extract Bates numbers from PDFs')
    parser.add_argument('folder', help='Path to folder containing PDFs')
    parser.add_argument('--log', default='INFO',
                        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
                        help='Set the logging level')
    # NOTE: --width-start and --height-start are still accepted on the
    # command line, but main() does not pass them on to any function.
    parser.add_argument('--width-start', type=float, default=0.67,
                        help='Relative x-coordinate to start crop (0-1)')
    parser.add_argument('--height-start', type=float, default=0.83,
                        help='Relative y-coordinate to start crop (0-1)')
    parser.add_argument('--prefix', type=str, default='FWS-',
                        help='Prefix pattern to search for (default: "FWS-")')
    parser.add_argument('--digits', type=int, default=6,
                        help='Number of digits to match after prefix (default: 6)')
    parser.add_argument('--ocr', action='store_true',
                        help='Enable OCR for pages with little or no text (disabled by default)')
    parser.add_argument('--dry-run', action='store_true',
                        help='Only print matches without renaming files')
    parser.add_argument('--name-prefix', type=str,
                        help='Prefix to use when renaming files (e.g., "FWS ")')

    args = parser.parse_args()

    # A zero or negative count would build a pattern that can never match a
    # Bates number, so every PDF would be reported as "no match".
    if args.digits < 1:
        parser.error("--digits must be a positive integer")

    setup_logging(args.log)

    # Check dependencies based on whether OCR is enabled
    check_dependencies(args.ocr)

    # Display the pattern we're looking for
    display_pattern = f"{args.prefix}{'#' * args.digits}"
    print(f"Looking for pattern: {display_pattern}")

    if not args.dry_run and args.name_prefix is None:
        logging.error("Must specify --name-prefix when not in dry-run mode")
        sys.exit(1)

    pattern = build_regex_pattern(args.prefix, args.digits)
    process_folder(args.folder, pattern, args.ocr, args.dry_run, args.name_prefix)
|
|
|
|
|
|
|
|
|
|
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
|
|
|
|
|
|