#!/usr/bin/env python3

import os
import re
import requests
import time
import pkg_resources

# Top-level names of Python standard-library modules. Imports of these are
# never looked up on PyPI because they ship with the interpreter.
# 'zoneinfo' is deliberately absent: KNOWN_CORRECTIONS maps it to its backport.
BUILTIN_MODULES = {
    'abc', 'aifc', 'argparse', 'array', 'ast', 'asynchat', 'asyncio', 'asyncore', 'atexit',
    'audioop', 'base64', 'bdb', 'binascii', 'binhex', 'bisect', 'builtins', 'bz2', 'calendar',
    'cgi', 'cgitb', 'chunk', 'cmath', 'cmd', 'code', 'codecs', 'codeop', 'collections', 'colorsys',
    'compileall', 'concurrent', 'configparser', 'contextlib', 'copy', 'copyreg', 'crypt', 'csv',
    'ctypes', 'curses', 'dataclasses', 'datetime', 'dbm', 'decimal', 'difflib', 'dis', 'distutils',
    'doctest', 'dummy_threading', 'email', 'encodings', 'ensurepip', 'enum', 'errno', 'faulthandler',
    'fcntl', 'filecmp', 'fileinput', 'fnmatch', 'formatter', 'fractions', 'ftplib', 'functools',
    'gc', 'getopt', 'getpass', 'gettext', 'glob', 'gzip', 'hashlib', 'heapq', 'hmac', 'html', 'http',
    'imaplib', 'imghdr', 'imp', 'importlib', 'inspect', 'io', 'ipaddress', 'itertools', 'json',
    'keyword', 'lib2to3', 'linecache', 'locale', 'logging', 'lzma', 'mailbox', 'mailcap', 'marshal',
    'math', 'mimetypes', 'modulefinder', 'multiprocessing', 'netrc', 'nntplib', 'numbers', 'operator',
    'optparse', 'os', 'ossaudiodev', 'parser', 'pathlib', 'pdb', 'pickle', 'pickletools', 'pipes',
    'pkgutil', 'platform', 'plistlib', 'poplib', 'posix', 'pprint', 'profile', 'pstats', 'pty',
    'pwd', 'py_compile', 'pyclbr', 'pydoc', 'queue', 'quopri', 'random', 're', 'readline',
    'reprlib', 'resource', 'rlcompleter', 'runpy', 'sched', 'secrets', 'select', 'selectors', 'shelve',
    'shlex', 'shutil', 'signal', 'site', 'smtpd', 'smtplib', 'sndhdr', 'socket', 'socketserver',
    'spwd', 'sqlite3', 'ssl', 'stat', 'statistics', 'string', 'stringprep', 'struct', 'subprocess',
    'sunau', 'symtable', 'sys', 'sysconfig', 'syslog', 'tabnanny', 'tarfile', 'telnetlib', 'tempfile',
    'termios', 'test', 'textwrap', 'threading', 'time', 'timeit', 'token', 'tokenize', 'trace',
    'traceback', 'tracemalloc', 'tty', 'turtle', 'types', 'typing', 'unicodedata', 'unittest',
    'urllib', 'uu', 'uuid', 'venv', 'warnings', 'wave', 'weakref', 'webbrowser', 'xdrlib', 'xml',
    'xmlrpc', 'zipapp', 'zipfile', 'zipimport', 'zlib',
    # Standard-library modules that were missing from the list above, so that
    # e.g. 'from __future__ import annotations' no longer triggers a PyPI lookup.
    '__future__', '_thread', 'contextvars', 'graphlib', 'grp', 'idlelib', 'mmap', 'msilib',
    'msvcrt', 'nis', 'ntpath', 'opcode', 'posixpath', 'symbol', 'tkinter', 'tomllib',
    'turtledemo', 'winreg', 'winsound',
}

# Import name -> PyPI package name, for modules whose import name differs from
# the name used for the PyPI lookup. check_pypi_availability() looks up (and
# records) the mapped value instead of the import name; names that are not
# listed here are looked up unchanged.
KNOWN_CORRECTIONS = {
    'dateutil': 'python-dateutil',
    'dotenv': 'python-dotenv',
    'docx': 'python-docx',
    'tesseract': 'pytesseract',
    'magic': 'python-magic',
    'multipart': 'python-multipart',
    'newspaper': 'newspaper3k',
    'srtm': 'elevation',
    'yaml': 'pyyaml',
    'zoneinfo': 'backports.zoneinfo'
}

# Generic module names that process_import_lines() never reports.
# NOTE(review): presumably these tend to be project-local modules rather than
# installable packages -- confirm before extending the list.
EXCLUDED_NAMES = {'models', 'data', 'convert', 'example', 'tests'}

def find_imports(root_dir):
    """Collect the import-looking lines of every ``.py`` file under *root_dir*.

    The directory tree is walked recursively. A line is kept when, after
    stripping surrounding whitespace, it starts with ``import `` or ``from ``.
    This is a textual match, not a parse, so matching lines inside strings are
    collected as well.

    Args:
        root_dir: Directory to scan.

    Returns:
        A dict mapping each readable ``.py`` file path to the list of its
        stripped import lines (possibly empty). Files that cannot be opened
        are reported with a warning and left out of the result.
    """
    imports_by_file = {}
    for dirpath, _, filenames in os.walk(root_dir):
        for filename in filenames:
            if not filename.endswith('.py'):
                continue
            filepath = os.path.join(dirpath, filename)
            try:
                # UTF-8 is Python's default source encoding. Undecodable bytes
                # are replaced so one odd file cannot abort the whole scan.
                with open(filepath, 'r', encoding='utf-8', errors='replace') as file:
                    stripped_lines = (line.strip() for line in file)
                    import_lines = [
                        line for line in stripped_lines
                        if line.startswith(('import ', 'from '))
                    ]
            except OSError as error:
                # E.g. a broken symlink or a permission problem: skip the file.
                print(f"Warning: Unable to read {filepath}: {error}")
                continue
            imports_by_file[filepath] = import_lines
    return imports_by_file

# Matches the ' as alias' suffix of an imported module.
_IMPORT_ALIAS = re.compile(r'\s+as\s+\w+')
# Captures the module path of a 'from <module> import ...' statement.
_FROM_IMPORT = re.compile(r'from\s+(\S+)\s+import\b')


def process_import_lines(import_lines):
    """Extract the distinct top-level module names from import statements.

    Handles ``import a``, ``import a.b as c, d`` and ``from a.b import c``.
    Trailing ``#`` comments are ignored. Relative imports (``from . import x``)
    yield no name. Lines that merely look like imports, such as prose starting
    with "from " or "import ", are dropped because the candidate is not a
    valid identifier or the ``import`` keyword is missing.

    Names in EXCLUDED_NAMES are skipped. All-uppercase names are skipped as
    well (existing behaviour; note that this also drops e.g. ``PIL``).

    Args:
        import_lines: Iterable of source lines.

    Returns:
        A set of top-level module names.
    """
    modules = set()
    for line in import_lines:
        # Cut the trailing comment so it is not mistaken for part of a name.
        statement = line.split('#', 1)[0].strip()
        if statement.startswith('import '):
            # Slice off the keyword only; str.replace would also remove any
            # later occurrence of 'import ' in the line.
            candidates = statement[len('import '):].split(',')
        elif statement.startswith('from '):
            match = _FROM_IMPORT.match(statement)
            if match is None:
                continue
            candidates = [match.group(1)]
        else:
            continue
        for candidate in candidates:
            name = _IMPORT_ALIAS.sub('', candidate).split('.')[0].strip()
            # isidentifier() also rejects the empty string left by relative
            # imports and anything that still contains spaces or punctuation.
            if name.isidentifier() and not name.isupper() and name not in EXCLUDED_NAMES:
                modules.add(name)
    return modules

def check_pypi_availability(libraries):
    """Split *libraries* into packages found on PyPI and packages not found.

    Names listed in BUILTIN_MODULES are ignored. Every other name is first
    translated through KNOWN_CORRECTIONS and then looked up with
    check_library_on_pypi(). A lookup that raises a requests exception prints
    a warning and counts the package as not found.

    Args:
        libraries: Iterable of module names.

    Returns:
        A tuple ``(available, unavailable)`` of sets of PyPI package names.
    """
    found, missing = set(), set()
    for name in libraries:
        if name in BUILTIN_MODULES:
            # Ships with Python; nothing to look up.
            continue
        package = KNOWN_CORRECTIONS.get(name, name)
        try:
            exists = check_library_on_pypi(package)
        except requests.exceptions.RequestException:
            print(f"Warning: Unable to check {package} on PyPI due to network error.")
            exists = False
        target = found if exists else missing
        target.add(package)
    return found, missing

def check_library_on_pypi(library, max_retries=3, retry_delay=1, timeout=5):
    """Return True when *library* has a project page in the PyPI JSON API.

    The request is retried after a network error and after an HTTP 429 or 5xx
    response, because neither says whether the package exists.

    Args:
        library: Package name to look up.
        max_retries: Total number of attempts; values below 1 count as 1.
        retry_delay: Seconds to wait between attempts.
        timeout: Per-request timeout in seconds.

    Returns:
        True if the last response had status 200, otherwise False.

    Raises:
        requests.exceptions.RequestException: If the final attempt fails with
            a network error.
    """
    # Imported locally so this function needs no new file-level import.
    from urllib.parse import quote

    # Percent-encode the name so stray characters cannot alter the URL path.
    url = f"https://pypi.org/pypi/{quote(library, safe='')}/json"
    attempts = max(1, max_retries)
    for attempt in range(attempts):
        is_last_attempt = attempt == attempts - 1
        try:
            response = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException:
            if is_last_attempt:
                raise
        else:
            status = response.status_code
            is_transient = status == 429 or status >= 500
            if is_last_attempt or not is_transient:
                return status == 200
        # Only reached when another attempt follows.
        time.sleep(retry_delay)
def save_to_requirements_file(available, output_file='requirements.txt'):
    """Append the packages in *available* that *output_file* does not list yet.

    Existing entries are compared by normalised project name, so a pinned or
    differently-cased line such as ``PyYAML==6.0`` prevents ``pyyaml`` from
    being added again. The file is created if it does not exist. If it does
    not end with a newline, one is written first so that the new entries start
    on their own line.

    Args:
        available: Iterable of package names.
        output_file: Path of the requirements file to append to.
    """
    def canonical(requirement):
        # 'Foo_Bar[extra]>=1.0 ; marker  # note' -> 'foo-bar'
        name = re.split(r'[\s\[;<>=!~@#]', requirement.strip(), maxsplit=1)[0]
        return re.sub(r'[-_.]+', '-', name).lower()

    known = set()
    ends_mid_line = False
    if os.path.isfile(output_file):
        with open(output_file, 'r', encoding='utf-8', errors='replace') as file:
            lines = file.readlines()
        known = {canonical(line) for line in lines}
        ends_mid_line = bool(lines) and not lines[-1].endswith('\n')

    new_packages = []
    for pkg in sorted(available):
        key = canonical(pkg)
        if key not in known:
            known.add(key)
            new_packages.append(pkg)

    # Opened even when nothing is added, so the file always exists afterwards.
    with open(output_file, 'a', encoding='utf-8') as file:
        if new_packages and ends_mid_line:
            file.write('\n')
        for pkg in new_packages:
            print(f"Adding to {output_file}: {pkg}")
            file.write(pkg + '\n')
def save_to_missing_file(unavailable, output_file='missing-packages.txt'):
    """Append the names in *unavailable* that *output_file* does not list yet.

    Existing lines are compared after stripping whitespace. The file is
    created if it does not exist. If it does not end with a newline, one is
    written first so that the new entries start on their own line.

    Args:
        unavailable: Iterable of package names.
        output_file: Path of the file to append to.
    """
    already_listed = set()
    ends_mid_line = False
    if os.path.isfile(output_file):
        with open(output_file, 'r', encoding='utf-8', errors='replace') as file:
            lines = file.readlines()
        already_listed = {line.strip() for line in lines}
        ends_mid_line = bool(lines) and not lines[-1].endswith('\n')

    new_packages = sorted(set(unavailable) - already_listed)

    # Opened even when nothing is added, so the file always exists afterwards.
    with open(output_file, 'a', encoding='utf-8') as file:
        if new_packages and ends_mid_line:
            file.write('\n')
        for pkg in new_packages:
            print(f"Adding to {output_file}: {pkg}")
            file.write(pkg + '\n')

def main():
    """Scan the current working directory and record its third-party imports.

    Each ``.py`` file's imports are checked against PyPI. Packages that are
    found are appended to requirements.txt and the others to
    missing-packages.txt.
    """
    root_dir = os.getcwd()  # Get the current working directory

    # Modules already looked up for an earlier file. Each lookup is a network
    # request and the result is already in the output files, so skip repeats.
    checked = set()
    for filepath, import_lines in find_imports(root_dir).items():
        print(f"# Processing {filepath}")
        pending = process_import_lines(import_lines) - checked
        checked |= pending
        available, unavailable = check_pypi_availability(pending)
        save_to_requirements_file(available)
        save_to_missing_file(unavailable)

    print("Processed import statements have been saved to requirements.txt and missing-packages.txt")


if __name__ == "__main__":
    main()