Python之fnmatch查找目录下重复的文件

注:部分内容来自书籍或者网络,如有侵权,请联系删除。

#!/usr/bin/python
# -*- coding:UTF-8 -*-
from __future__ import print_function
import hashlib
import sys
import os
import fnmatch

# Maximum number of bytes requested per read when get_chunk() streams a file.
CHUNK_SIZE = 8192

def is_file_match(filename,patterns):
    """Return True if ``filename`` matches at least one pattern.

    Each entry of ``patterns`` is a shell-style wildcard handed to
    ``fnmatch.fnmatch``. An empty ``patterns`` never matches.
    """
    return any(fnmatch.fnmatch(filename, candidate) for candidate in patterns)

def find_specific_files(path, patterns=('*',), exclude_dirs=()):
    """Yield paths of files under ``path`` whose names match ``patterns``.

    Args:
        path: Root directory, walked recursively with ``os.walk``.
        patterns: Iterable of shell-style wildcard patterns; a file is
            yielded when its base name matches any of them. The default
            matches every file.
        exclude_dirs: Iterable of directory names (base names, not paths)
            that are not descended into.

    Yields:
        ``os.path.join(dirpath, filename)`` for every matching file.
    """
    # Defaults are immutable tuples so no state can leak between calls.
    excluded = frozenset(exclude_dirs)
    for dirpath, dirnames, filenames in os.walk(path):
        # os.walk (top-down) only descends into the names still present in
        # ``dirnames``, so it must be pruned in place.
        dirnames[:] = [name for name in dirnames if name not in excluded]
        for filename in filenames:
            if is_file_match(filename, patterns):
                yield os.path.join(dirpath, filename)

def get_chunk(filename):
    """Yield the bytes of ``filename`` in pieces of at most CHUNK_SIZE bytes.

    The file is opened in binary mode and closed when the generator finishes.
    """
    with open(filename, 'rb') as stream:
        # iter(callable, sentinel) stops at the first empty read, i.e. at EOF.
        for block in iter(lambda: stream.read(CHUNK_SIZE), b''):
            yield block
                
def get_file_checksum(filename):
    """Return the hexadecimal MD5 digest of the contents of ``filename``.

    The file is fed to the hash chunk by chunk via ``get_chunk`` rather than
    being read in a single call.
    """
    digest = hashlib.md5()
    for block in get_chunk(filename):
        digest.update(block)
    return digest.hexdigest()

def main():
    """Print every file under the directory in ``sys.argv[1]`` that duplicates an earlier one.

    Files are compared by the checksum from ``get_file_checksum``. The first
    file seen with a given checksum is remembered, and each later file with
    the same checksum is reported against it.

    Raises:
        SystemExit: If no directory argument was given, or the argument is
            not an existing directory.
    """
    # Check the argument count explicitly instead of appending to sys.argv.
    if len(sys.argv) < 2:
        raise SystemExit("usage: {0} <directory>".format(sys.argv[0]))
    directory = sys.argv[1]
    if not os.path.isdir(directory):
        raise SystemExit("{0} is not a directory".format(directory))

    # Maps checksum -> path of the first file seen with that checksum.
    record = {}
    for item in find_specific_files(directory):
        try:
            checksum = get_file_checksum(item)
        except (IOError, OSError) as err:
            # Unreadable file or broken symlink: skip it, keep scanning.
            print('skip unreadable file: {0} ({1})'.format(item, err),
                  file=sys.stderr)
            continue
        # Membership test: ``checksum is record`` compared a str with the
        # dict by identity, was always False, and so reported nothing.
        if checksum in record:
            print('find duplicate file: {0} vs {1}'.format(record[checksum],item))
        else:
            record[checksum] = item
            
# Run the duplicate scan only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

实现效果

[root@Ansible Python]# python search.py /root/Python
find duplicate file: /root/Python/cmd1.py vs /root/Python/cmd2.py
find duplicate file: /root/Python/cmd1.py vs /root/Python/11.py


「 文章如果对你有帮助,请点个赞哦^^ 」 

0