import os
import hashlib
from collections import defaultdict
import sys
import re

# Optional substring taken from the first command-line argument. When the
# script is started without arguments the value stays None.
req_contains = sys.argv[1] if len(sys.argv) > 1 else None

def collect_sizes(root='.'):
    """Walk *root* and group the paths of the files found by their size.

    Symbolic links are followed, but every physical directory is descended
    into only once and every physical file is recorded only once.  Without
    that guard a link pointing at a parent directory makes the walk recurse
    without bound, and a file reachable through two paths would later be
    reported as a duplicate of itself (deleting the "duplicate" could then
    remove the only real copy).

    Entries that cannot be stat'ed (dangling links, permission errors, files
    removed during the scan) are reported on stderr and skipped.

    Returns a ``(by_size, count)`` tuple: ``by_size`` maps a size in bytes
    to the list of paths with that size, ``count`` is the number of file
    entries looked at (including skipped ones).
    """
    by_size = defaultdict(list)
    seen_dirs = {os.path.realpath(root)}
    seen_files = set()
    count = 0

    for (path, dirs, files) in os.walk(root, followlinks=True):
        # Prune in place so os.walk never enters a directory whose real
        # location was already scheduled (symlink cycles and aliases).
        unvisited = []
        for name in dirs:
            real_dir = os.path.realpath(os.path.join(path, name))
            if real_dir not in seen_dirs:
                seen_dirs.add(real_dir)
                unvisited.append(name)
        dirs[:] = unvisited

        for file in files:
            count += 1
            full_path = os.path.join(path, file)
            print(f"\r[{count}] Processing: {full_path}\x1b[0K", end='')

            real_file = os.path.realpath(full_path)
            if real_file in seen_files:
                # Same physical file reached through another path.
                continue

            try:
                sz = os.stat(full_path).st_size
            except OSError as err:
                print(f"\nSkipping {full_path}: {err}", file=sys.stderr)
                continue

            seen_files.add(real_file)
            by_size[sz].append(full_path)

    return by_size, count


# sizes: size in bytes -> paths of that size; n: number of entries examined.
sizes, n = collect_sizes('.')

# Terminate the in-place progress line.
print()

def check_hashes(files):
    """Return True when every path in *files* has the same MD5 digest.

    Each file is hashed in fixed-size chunks so that large files are never
    loaded into memory as a whole.  Hashing stops at the first file whose
    digest differs from the first file's digest, in which case False is
    returned.  An empty or single-element *files* yields True.

    Errors raised while opening or reading a file propagate to the caller.
    """
    chunk_size = 1 << 20  # 1 MiB per read keeps memory use bounded
    first_hash = None

    for file in files:
        hash_ = hashlib.md5()
        with open(file, 'rb') as f:
            # iter() with a sentinel stops on the empty bytes object that
            # read() returns at end of file.
            for chunk in iter(lambda: f.read(chunk_size), b''):
                hash_.update(chunk)

        digest = hash_.digest()
        if first_hash is None:
            first_hash = digest
        elif digest != first_hash:
            return False

    return True

# Resolve every group of same-sized files whose contents all hash alike:
# keep one "canonical" path and unlink the others.
# NOTE: check_hashes() only accepts a group when ALL of its files match, so a
# size group that mixes identical and different files is not reported.
for v in sizes.values():
    if len(v) < 2 or not check_hashes(v):
        continue

    print(f"Found duplicates: {v}")

    if req_contains is not None:
        # Non-interactive mode: keep the first path containing the substring.
        canonical_file = next(
            (candidate for candidate in v if req_contains in candidate),
            None,
        )
        if canonical_file is None:
            # Nothing matches; deleting anything here would be a guess, so
            # leave the whole group untouched.
            print(f"No file contains the required substring {req_contains}, skipping this group")
            continue
        print(f"Picking file that contains the required substring {req_contains}: {canonical_file}")
    else:
        # Interactive mode: ask until a number within 1..len(v) is entered.
        # Only ValueError is handled, so Ctrl-C and end-of-input abort the
        # script instead of re-prompting forever.
        while True:
            print(f"Please pick a canonical file [1-{len(v)}]: ")
            try:
                canonical_id = int(input())
            except ValueError:
                canonical_id = 0  # forces the out-of-range message below
            if 1 <= canonical_id <= len(v):
                break
            print(f"Invalid input, please enter a number from 1 to {len(v)}")

        canonical_file = v[canonical_id - 1]

    for duplicate in v:
        if duplicate != canonical_file:
            print(f"Unlinking {duplicate}...")
            os.unlink(duplicate)

