删除内容重复的文件

脚本功能

指定一个目录，删除其中内容重复的文件。根据md5sum值判断文件是否重复，根据文件名长短判断保留文件名短的文件。根据文件数量判断处理进度。

脚本内容

文件名：cleanup_duplicates.sh

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#!/bin/bash

# 设置默认文件夹路径为 /home/user/test
folder=${1:-"/home/user/test"}

# 统计文件夹下的文件总数
file_count=$(find "$folder" -maxdepth 1 -type f | wc -l)
processed_count=0

# 找出文件夹下所有文件，并计算每个文件的 MD5 值
find "$folder" -maxdepth 1 -type f -exec md5sum {} + | sort > "$folder/md5sum.txt"

# 根据 MD5 值进行重复文件的处理
while read -r line; do
    md5=$(echo "$line" | awk '{print $1}')
    filename=$(echo "$line" | awk '{print $2}')

    # 检查是否有重复的文件
    duplicates=$(grep "$md5" "$folder/md5sum.txt" | grep -v "$filename")

    if [[ -n $duplicates ]]; then
        shortest_filename=$filename

        # 查找文件名较短的重复文件
        while read -r duplicate; do
            duplicate_filename=$(echo "$duplicate" | awk '{print $2}')
            if [[ ${#duplicate_filename} -lt ${#shortest_filename} ]]; then
                shortest_filename=$duplicate_filename
            fi
        done <<< "$duplicates"

        # 删除除了文件名最短的重复文件之外的其他文件
        while read -r duplicate; do
            duplicate_filename=$(echo "$duplicate" | awk '{print $2}')
            if [[ "$duplicate_filename" != "$shortest_filename" ]]; then
                rm "$duplicate_filename"
            fi
        done <<< "$duplicates"
    fi

    # 更新进度
    processed_count=$((processed_count + 1))
    progress=$((processed_count * 100 / file_count))
    echo "Processing: $progress%"
done <<< "$(tail -n +2 "$folder/md5sum.txt")"  # 跳过第一行，不包括 md5sum.txt 文件本身

# 删除临时文件
rm "$folder/md5sum.txt"

使用方法

1
2
chmod +x cleanup_duplicates.sh
$ ./cleanup_duplicates.sh /home/cjx/test/test