import argparse
import csv
import os
import posixpath
import subprocess
import sys
import tarfile
import zipfile


def get_files(base_dir):
    """Collect every file below base_dir as a lowercased relative path.

    The tree is walked recursively; each file's path is made relative to
    base_dir and lowercased so that later comparisons ignore case.

    Returns:
        A set of lowercased relative path strings (empty if no files exist).
    """
    return {
        os.path.relpath(os.path.join(parent, name), base_dir).lower()
        for parent, _subdirs, names in os.walk(base_dir)
        for name in names
    }


def list_archive_contents(archive_path):
    """List the files stored in a .zip, .tgz or .tar.gz archive without extracting.

    The archive is read with the standard-library ``zipfile`` and ``tarfile``
    modules instead of parsing the text output of external ``unzip``/``tar``
    commands, so member names containing spaces are returned intact and no
    external tools need to be installed.

    Args:
        archive_path: Path to the archive. The archive type is chosen from
            the file extension, compared case-insensitively.

    Returns:
        A list of lowercased member paths in archive order, with directory
        entries excluded. An empty list is returned for unsupported
        extensions, or when the archive cannot be read (in which case the
        error is reported on stderr).
    """
    lowered = archive_path.lower()

    if lowered.endswith('.zip'):
        try:
            with zipfile.ZipFile(archive_path) as archive:
                return [
                    info.filename.lower()
                    for info in archive.infolist()
                    if not info.is_dir()
                ]
        except Exception as e:
            # Deliberately broad: listing is best-effort, and one unreadable
            # archive must not abort the whole comparison run.
            print(f"Error reading zip {archive_path}: {e}", file=sys.stderr)
            return []

    if lowered.endswith(('.tgz', '.tar.gz')):
        try:
            with tarfile.open(archive_path, 'r:gz') as archive:
                # Iterating the TarFile yields one TarInfo per member; only
                # directory entries are skipped, so links etc. are still listed.
                return [
                    member.name.lower()
                    for member in archive
                    if not member.isdir()
                ]
        except Exception as e:
            # Same best-effort policy as for zip archives above.
            print(f"Error reading tgz {archive_path}: {e}", file=sys.stderr)
            return []

    return []


def normalize_path(p):
    """Return p lowercased with every backslash turned into a forward slash."""
    lowered = p.lower()
    # Splitting on the backslash and re-joining with "/" swaps the separator.
    return "/".join(lowered.split("\\"))


# Archive extensions whose contents are listed instead of the archive itself.
_ARCHIVE_SUFFIXES = ('.tar.gz', '.tgz', '.zip')


def _strip_archive_suffix(filename):
    """Return filename without its archive extension (matched case-insensitively)."""
    lowered = filename.lower()
    for suffix in _ARCHIVE_SUFFIXES:
        if lowered.endswith(suffix):
            return filename[:-len(suffix)]
    return filename


def _comparison_key(path):
    """Return the canonical form of a relative path used on both sides of the comparison."""
    # normalize_path lowercases and unifies separators to "/"; normpath then
    # collapses "./" segments (commonly stored by tar), which never occur in
    # the relative paths produced by os.walk.
    return posixpath.normpath(normalize_path(path))


def _collect_destination_files(dest_dir):
    """Walk dest_dir and gather comparison keys for plain files and archive members.

    Returns:
        A tuple ``(direct_files, all_files, archive_count)`` where
        ``direct_files`` holds keys of non-archive files, ``all_files``
        additionally holds the candidate keys of archive members, and
        ``archive_count`` is the number of archives encountered.
    """
    direct_files = set()
    all_files = set()
    archive_count = 0

    for root, _dirs, filenames in os.walk(dest_dir):
        for name in filenames:
            full = os.path.join(root, name)
            rel = os.path.relpath(full, dest_dir)

            if not name.lower().endswith(_ARCHIVE_SUFFIXES):
                key = _comparison_key(rel)
                direct_files.add(key)
                all_files.add(key)
                continue

            archive_count += 1
            archive_dir = os.path.dirname(rel)
            archive_stem = _strip_archive_suffix(name)

            for member in list_archive_contents(full):
                # Candidate 1: the member as if the archive had been
                # extracted in place, next to the archive file.
                all_files.add(_comparison_key(os.path.join(archive_dir, member)))
                # Candidate 2: the member under a folder named after the archive.
                # NOTE(review): this candidate does not include archive_dir, so
                # it is relative to the destination root — confirm this is intended.
                all_files.add(_comparison_key(os.path.join(archive_stem, member)))

    return direct_files, all_files, archive_count


def _write_csv(csv_path, all_files, source_files, all_dest_files):
    """Write one CSV row per path stating whether it exists on each side."""
    # UTF-8 is set explicitly so non-ASCII file names do not depend on the
    # platform's default encoding.
    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['file_name', 'source_disk', 'destination_disk'])
        for path in all_files:
            in_source = "YES" if path in source_files else "NO"
            in_dest = "YES" if path in all_dest_files else "NO"
            writer.writerow([path, in_source, in_dest])


def main():
    """Compare a source tree with a destination tree and write a CSV report.

    Command-line arguments are the source directory, the destination
    directory and an optional ``-o/--output`` CSV path. Files inside
    .zip/.tgz/.tar.gz archives in the destination count as present. Paths are
    compared case-insensitively with "/" as the separator. Progress and a
    summary are printed to stdout; the process exits with status 1 if either
    directory does not exist.
    """
    parser = argparse.ArgumentParser(
        description="Compare files between a source directory and a destination directory."
    )
    parser.add_argument(
        "source",
        help="Path to the source (local) directory",
    )
    parser.add_argument(
        "destination",
        help="Path to the destination (external) directory",
    )
    parser.add_argument(
        "-o", "--output",
        default="gnss_file_list_compare.csv",
        # %(default)s keeps the help text in sync with the actual default.
        help="Output CSV file path (default: %(default)s)",
    )
    args = parser.parse_args()

    source_dir = os.path.abspath(args.source)
    dest_dir = os.path.abspath(args.destination)
    csv_path = args.output

    if not os.path.isdir(source_dir):
        print(f"Error: Source directory '{source_dir}' does not exist.", file=sys.stderr)
        sys.exit(1)
    if not os.path.isdir(dest_dir):
        print(f"Error: Destination directory '{dest_dir}' does not exist.", file=sys.stderr)
        sys.exit(1)

    print(f"Source: {source_dir}")
    print(f"Destination: {dest_dir}")
    print(f"Output: {csv_path}")
    print()

    print("Step 1: Getting source files...")
    # Source paths go through the same key function as destination paths so
    # that platform separators (e.g. "\\" on Windows) cannot cause mismatches.
    source_files = {_comparison_key(p) for p in get_files(source_dir)}
    print(f"  Found {len(source_files)} source files")

    print("Step 2: Getting destination files (including archive contents)...")
    dest_direct_files, all_dest_files, archive_count = _collect_destination_files(dest_dir)

    print(f"  Direct destination files: {len(dest_direct_files)}")
    print(f"  Archives found: {archive_count}")
    print(f"  Total destination files (including archive contents): {len(all_dest_files)}")

    print("Step 3: Comparing...")
    missing_in_dest = source_files - all_dest_files
    present_in_both = source_files & all_dest_files

    print(f"  Files in source: {len(source_files)}")
    print(f"  Files in destination (total): {len(all_dest_files)}")
    print(f"  Files in both: {len(present_in_both)}")
    print(f"  Files missing in destination: {len(missing_in_dest)}")

    print("Step 4: Writing CSV...")
    all_files = sorted(source_files | all_dest_files)
    _write_csv(csv_path, all_files, source_files, all_dest_files)

    print(f"CSV saved to {csv_path}")
    print("\nSummary:")
    print(f"  Total unique files: {len(all_files)}")
    print(f"  Missing in destination: {len(missing_in_dest)}")

    if missing_in_dest:
        print("\nFirst 30 missing files:")
        for path in sorted(missing_in_dest)[:30]:
            print(f"  {path}")


# Run the comparison only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
