averainy's Blog

averainy

27 Feb 2022

使用python删除重复文件

原因

群晖系统更新到 DSM 7 之后,图片管理被切换到了 Synology Photos。结果就出现了没办法同步备份的问题:想解决问题只能将图片上传到共享空间,个人空间没办法自动同步备份。为了解决这个问题,只好将已有的文件挪到共享空间,同时又重新将 iCloud 上的图片同步了一遍。这样搞完之后就出现了大量的重复文件。鉴于没找到靠谱的去重工具,只好自己用 Python 来做一个。

代码

# -*- coding:utf-8 -*-
  
from asyncio import ALL_COMPLETED
import os
import hashlib
import time
import sys
import shutil
from concurrent.futures import ThreadPoolExecutor,wait,ALL_COMPLETED

# Module-level accumulator of file paths: get_urllist() appends to it and the
# main script reads it to schedule one hashing task per path.
urllist = []
# Compute the MD5 digest of a file
def get_ms5(filename, chunk_size=1024 * 1024):
    """Return ``(md5_hex_digest, filename)`` for the file at *filename*.

    The file is opened in binary mode and hashed in blocks of *chunk_size*
    bytes, so large files are never loaded into memory in one piece. The
    path is returned alongside the digest so that callers collecting results
    from a thread pool can tell which file each digest belongs to.

    Args:
        filename: Path of the file to hash.
        chunk_size: Number of bytes read per iteration (default 1 MiB).

    Returns:
        A ``(hexdigest, filename)`` tuple.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    m = hashlib.md5()
    # The context manager guarantees the handle is closed even if a read
    # fails (the previous ``mfile.close`` without parentheses never ran).
    with open(filename, "rb") as mfile:
        # iter() with a sentinel stops when read() returns b"" at EOF.
        for chunk in iter(lambda: mfile.read(chunk_size), b""):
            m.update(chunk)
    return (m.hexdigest(), filename)
# Collect the paths of all files under a directory tree
def get_urllist(base_dir):
    """Recursively append the path of every file under *base_dir* to ``urllist``.

    Results are accumulated in the module-level ``urllist`` list; nothing is
    returned. Entries are classified with ``os.path.isfile`` and
    ``os.path.isdir``; an entry that is neither (for example a broken
    symbolic link) is skipped instead of being recursed into, which would
    make ``os.listdir`` raise.

    Args:
        base_dir: Directory whose tree is scanned.

    Raises:
        OSError: If *base_dir* or a sub-directory cannot be listed.
    """
    for name in os.listdir(base_dir):
        url = os.path.join(base_dir, name)
        if os.path.isfile(url):
            urllist.append(url)
        elif os.path.isdir(url):
            get_urllist(url)
# Script entry point: hash every file under base_dir, group the paths by
# MD5, then move every duplicate (all but the first path of each group) into
# D:\chongfu\<md5>\ and write each duplicate group to D:\chongfu.txt.
if __name__ == '__main__':
    print("test1")
    md5list = {}
    base_dir = 'D:\\nas\\photo\\SynologyDrive\\MobileBackup'
    get_urllist(base_dir)

    # Hash the files concurrently. Leaving the ``with`` block waits for every
    # submitted task to finish and releases the worker threads.
    with ThreadPoolExecutor(max_workers=20) as pool:
        all_tasks = [pool.submit(get_ms5, i) for i in urllist]

    # Group paths by digest; the first path seen for a digest is the one kept.
    for task in all_tasks:
        md5, filename = task.result()
        if md5 in md5list:
            md5list[md5].append(filename)
            print("重复:%s" % filename)
        else:
            md5list[md5] = [filename]

    dst_dir = r"D:\chongfu"
    # UTF-8 so that any file name can be written to the report regardless of
    # the console/locale code page.
    with open(r'D:\chongfu.txt', 'w', encoding='utf-8') as fp:
        for key, value in md5list.items():
            if len(value) <= 1:
                continue
            new_dir = os.path.join(dst_dir, key)
            # makedirs also creates dst_dir itself on the first run.
            os.makedirs(new_dir, exist_ok=True)
            for f in value[1:]:
                new_file = os.path.join(new_dir, os.path.basename(f))
                if os.path.exists(new_file):
                    # A file with the same name is already stored under this
                    # digest, so this copy is simply deleted.
                    os.remove(f)
                    continue
                shutil.move(f, new_dir)
            # Text mode translates "\n" to the platform line ending; writing
            # "\r\n" here would yield "\r\r\n" on Windows.
            fp.write(str(value) + "\n")

    print("end")

Categories