Python 多线程实现查找目录下哈希值相同的文件

使用 Python 多线程遍历目录,计算每个文件的哈希值,并列出哈希值相同(即内容相同)的文件。

import os
import hashlib
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict

# 计算文件的 MD5 哈希值
def calculate_file_hash(file_path):
    """Return the hexadecimal MD5 digest of the file at *file_path*.

    The file is opened in binary mode and consumed in 8192-byte pieces, so
    the whole file is never held in memory at once.
    """
    digest = hashlib.md5()
    with open(file_path, 'rb') as stream:
        # iter() with a sentinel stops at the first empty read, i.e. EOF.
        for block in iter(lambda: stream.read(8192), b''):
            digest.update(block)
    return digest.hexdigest()

def main(directory=r"D:\wwwroot\font"):
    """Print groups of files under *directory* that share the same hash.

    The directory tree is walked recursively, every file is hashed on a
    thread pool via ``calculate_file_hash``, and each group of two or more
    files with an identical digest is printed. Files that cannot be hashed
    are reported individually and skipped.

    Args:
        directory: Root directory to scan. Defaults to the path the script
            was originally hard-coded to, so ``main()`` behaves as before.
    """
    # os.walk() silently yields nothing for a missing or non-directory path,
    # which would be indistinguishable from "no duplicates found".
    if not os.path.isdir(directory):
        print(f"错误: 目录不存在: {directory}")
        return

    # Maps digest -> list of paths whose content produced that digest.
    file_hashes = defaultdict(list)

    try:
        # Collect every file path below the root before submitting work.
        files = []
        for root, _, filenames in os.walk(directory):
            for filename in filenames:
                files.append(os.path.join(root, filename))

        # Hash the files concurrently on a thread pool.
        with ThreadPoolExecutor() as executor:
            future_to_file = {
                executor.submit(calculate_file_hash, file): file
                for file in files
            }

            for future in as_completed(future_to_file):
                file = future_to_file[future]
                try:
                    hash_value = future.result()
                except Exception as e:
                    # Best effort: one unreadable file must not abort the scan.
                    print(f"计算文件哈希失败: {file} - {e}")
                else:
                    file_hashes[hash_value].append(file)

        # as_completed() yields in completion order, which varies from run to
        # run; sort digests and paths so the report is reproducible.
        for hash_value, file_list in sorted(file_hashes.items()):
            if len(file_list) > 1:
                print(f"相同的文件 (哈希值: {hash_value}):")
                for file in sorted(file_list):
                    print(f"  - {file}")
    except Exception as e:
        # Top-level boundary of the script: report instead of a traceback.
        print(f"错误: {e}")

# Run the duplicate scan only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

运行结果

D:\code\python\samefile>python samefile.py
相同的文件 (哈希值: 188165ae27047cdf248e5fb68bf3b5b6):
  - D:\wwwroot\font\2.ttf
  - D:\wwwroot\font\STHUPO.TTF
原文链接:,转发请注明来源!