-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
89 lines (71 loc) · 3.02 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# The premise behind this program is that I often misplace a file, redownload it, and then move it into the same folder where the old one was originally
# Therefore, this program searches a given directory for filenames that are identical except for a small bit at the end, so that I can delete these duplicates en masse instead of manually doing it
# E.g. "Filename.jpg" vs "Filename (Copy).jpg"
# Before just blindly deleting files with similar names, the program makes sure the contents are identical
# I don't want to upload a file that will mass-delete files, so I have made it only print the duplicates at the moment
# If you run it and find that the duplicates listed are actually duplicates, then you can change it to delete them
import os
import sys
import filecmp
from pathlib import Path
def count_same_beginning_length(a, b):
count = 0
for i in range(min(len(a), len(b))):
if a[i] == b[i]:
count += 1
else:
break
return count
def determine_near_duplicates(list, similarity_factor):
output_list = []
# The list is sorted alphabetically, so filenames that start the same way will be right next to each other
i = 0
while i < len(list) - 1:
entry = list[i]
entry_name, entry_ext = os.path.splitext(entry)
duplicates = []
j = i + 1
last_duplicate_found = i
while j < len(list):
next_entry = list[j]
next_entry_name, next_entry_ext = os.path.splitext(next_entry)
# To be a duplicate, the extensions have to be the same
if entry_ext != next_entry_ext:
j = j + 1
continue
similarity = count_same_beginning_length(entry_name, next_entry_name)
smallest_length = min(len(entry_name), len(next_entry_name))
is_duplicate = similarity > (smallest_length * similarity_factor)
if is_duplicate:
# If e.g. there are two files named ImageA (0).jpg and ImageA.jpg, make sure that the longer one is labelled as a duplicate
if len(next_entry) < len(entry):
entry, next_entry = next_entry, entry
duplicates.append(next_entry)
last_duplicate_found = j
j = j + 1
else:
break
if len(duplicates) > 0:
output_list.append((entry, duplicates))
i = last_duplicate_found + 1
return output_list
if __name__ == "__main__":
# If the first X% of two filenames is identical, then treat them as a potential duplicate
# 70% seems reasonable at this point
# This is just to filter the amount of comparisons we have to do, we still check file contents to make sure they're the same later
similarity_factor = 0.7
folder = Path(sys.argv[1])
list = os.listdir(folder)
list = determine_near_duplicates(list, similarity_factor)
for entry in list:
original = entry[0]
duplicates = entry[1]
for d in duplicates:
# Now we check whether the two files are the same
same = filecmp.cmp(folder / original, folder / d)
if same:
print("Found duplicates: ")
print(" Original: " + original)
print(" Duplicate: " + d)
# Uncomment this line at your own risk:
#os.remove(folder / d)