Python 多线程实现：查找目录下是否存在内容相同的文件，并列出哈希值相同的文件
import os
import hashlib
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict
def calculate_file_hash(file_path, chunk_size=8192):
    """Return the hexadecimal MD5 digest of a file's contents.

    The file is opened in binary mode and read in fixed-size chunks, so large
    files are hashed without loading them into memory all at once.

    MD5 is adequate for spotting accidental duplicates, but it is not
    collision-resistant against deliberately crafted files.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes to read per iteration; must be positive.

    Returns:
        The MD5 digest as a hexadecimal string.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
        OSError: If the file cannot be opened or read.
    """
    # read(0) returns b"" immediately, which would silently yield the digest
    # of empty data; a negative size would read the whole file in one go.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")

    hasher = hashlib.md5()
    with open(file_path, 'rb') as f:
        # An empty bytes object marks end-of-file and ends the loop.
        while chunk := f.read(chunk_size):
            hasher.update(chunk)
    return hasher.hexdigest()
def main(directory=r"D:\wwwroot\font"):
    """Find and print groups of files under a directory with equal hashes.

    Every file below ``directory`` (recursively) is hashed with
    ``calculate_file_hash`` on a thread pool. Files sharing a digest are
    printed together as one group; files whose hash cannot be computed are
    reported and skipped.

    Args:
        directory: Root directory to scan. Defaults to the original
            hard-coded path; pass your own directory to scan elsewhere.
    """
    # os.walk ignores errors by default, so a missing directory would
    # otherwise produce no output at all.
    if not os.path.isdir(directory):
        print(f"错误: 目录不存在: {directory}")
        return

    # Maps each hash digest to the paths of all files that produced it.
    file_hashes = defaultdict(list)
    try:
        # Collect every file path below the directory.
        files = [
            os.path.join(root, filename)
            for root, _, filenames in os.walk(directory)
            for filename in filenames
        ]

        # Hash the files concurrently on a thread pool.
        with ThreadPoolExecutor() as executor:
            future_to_file = {
                executor.submit(calculate_file_hash, file): file
                for file in files
            }
            for future in as_completed(future_to_file):
                file = future_to_file[future]
                try:
                    hash_value = future.result()
                except Exception as e:
                    # One unreadable file must not abort the whole scan.
                    print(f"计算文件哈希失败: {file} - {e}")
                else:
                    file_hashes[hash_value].append(file)

        # Report every digest shared by more than one file.
        for hash_value, file_list in file_hashes.items():
            if len(file_list) > 1:
                print(f"相同的文件 (哈希值: {hash_value}):")
                # Futures complete in arbitrary order; sort so the listing
                # within a group is stable between runs.
                for file in sorted(file_list):
                    print(f" - {file}")
    except Exception as e:
        # Top-level boundary: report unexpected failures without a traceback.
        print(f"错误: {e}")
# Run the duplicate scan only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
运行结果
D:\code\python\samefile>python samefile.py
相同的文件 (哈希值: 188165ae27047cdf248e5fb68bf3b5b6):
- D:\wwwroot\font\2.ttf
- D:\wwwroot\font\STHUPO.TTF