Python: scan all files and store them in a database


In the previous post, "Monitoring memory usage with memory_profiler", I wrote a script that walks every file under a directory tree (about 260,000 files) and profiled it; the run took roughly 90 seconds.
Now I want to store each file's name and path in a database, with the requirement that newly added files are written to the database and removed files are deleted from it.

1、Step one: write the files into the database.

1. The code:

root@WJL-SH4031667: # cat setFileToSql.py


from os import walk
from os.path import join,getmtime
import sqlite3

@profile   # injected by memory_profiler when run with "python -m memory_profiler"
def scan():
    dbName,tblName = "test.db","fmanages"
    conn = sqlite3.connect(dbName)
    cursor = conn.cursor()
    cursor.execute('create table {}(id varchar(20) primary key, fname varchar(30), path varchar(80), tm FLOAT, tag varchar(80))'.format(tblName))
    ret = {}
    for path, _, files in walk("./"):
        for f in files:
            # endswith("") is always True, so nothing is filtered out here;
            # substitute a real extension (e.g. ".py") to limit the scan
            if not f.endswith(""):
                continue

            filename = join(path, f)

            try:
                cursor.execute("insert into {} (fname, path,tm, tag) values (\'{}\',\'{}\',\'{}\', \'study\')".format(tblName, filename.rstrip(), path.rstrip(), getmtime(filename)))
            except:
                # a filename that breaks the formatted SQL is silently skipped
                continue

    cursor.close()
    conn.commit()
    conn.close()

if __name__ == "__main__":
    scan()
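
One thing worth pointing out about the insert above: building the SQL by string formatting breaks as soon as a filename contains a single quote, and that is exactly what the bare except: continue silently swallows. A minimal sketch of the same insert with standard sqlite3 "?" placeholders (insert_file is a hypothetical helper, not part of the script above):

from os.path import getmtime

def insert_file(cursor, tblName, filename, path):
    # Hedged sketch: the same insert as above, but with "?" placeholders so a
    # filename containing quotes is handled by sqlite3 instead of raising.
    cursor.execute(
        "insert into {} (fname, path, tm, tag) values (?, ?, ?, 'study')".format(tblName),
        (filename, path, getmtime(filename)),
    )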

2. Test run: elapsed time and memory usage

root@WJL-SH4031667: # time python -m memory_profiler setFileToSql.py


Line #    Mem usage    Increment   Line Contents
================================================
     5   29.820 MiB   29.820 MiB   @profile
     6                             def scan():
     7   29.824 MiB    0.004 MiB       dbName,tblName = "test.db","fmanages"
     8   29.984 MiB    0.160 MiB       conn = sqlite3.connect(dbName)
     9   29.988 MiB    0.004 MiB       cursor = conn.cursor()
    10   30.473 MiB    0.484 MiB       cursor.execute('create table {}(id varchar(20) primary key, fname varchar(30), path varchar(80), tm FLOAT, tag varchar(80))'.format(tblName))

    12   37.578 MiB -48094.824 MiB       for path, _, files in walk("./"):
    13   37.578 MiB -479957.762 MiB           for f in files:
    14   37.578 MiB -431869.512 MiB               if not f.endswith(""):
    15                                             continue

    20   37.578 MiB -431869.512 MiB               try:
    21   37.578 MiB -431869.469 MiB                   cursor.execute("insert into {} (fname, path,tm, tag) values (\'{}\',\'{}\',\'{}\', \'study\')".format(tblName, f.rstrip(), path.rstrip(), getmtime(f)))
    22   37.578 MiB -431727.805 MiB               except:
    23   37.578 MiB -431727.809 MiB                   continue
    24
    25   31.953 MiB   -5.625 MiB       cursor.close()
    26   31.953 MiB    0.000 MiB       conn.commit()
    27   31.953 MiB    0.000 MiB       conn.close()
    
python -m memory_profiler setFileToSql.py  94.52s user 253.09s system 99% cpu 5:49.69 total

3. Optimization

As the output shows, the database approach uses about 30 MB of memory. Compared with the previous post, "Monitoring memory usage with memory_profiler", where the dictionary-based version used about 100 MB, that is roughly 70 MB less — the dictionary really is a memory hog. Could the SQLite write speed be improved? Compared with the previous post, the elapsed time is about the same, so there is not much room left to optimize: the ~90 seconds of user time is dominated by IO.
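
If write speed ever did become the bottleneck, the usual sqlite3 levers are to collect the rows first and write them in one executemany call inside a single transaction, instead of issuing one execute per file. A rough sketch under that assumption (batch_insert is a hypothetical helper, not part of the script above):

from os import walk
from os.path import join, getmtime

def batch_insert(cursor, tblName, top="./"):
    # Hedged sketch: gather every (fname, path, tm, tag) row first, then hand
    # the whole list to executemany; the caller commits once at the end.
    rows = []
    for path, _, files in walk(top):
        for f in files:
            filename = join(path, f)
            try:
                rows.append((filename, path, getmtime(filename), "study"))
            except OSError:
                continue  # file vanished between walk() and getmtime()
    cursor.executemany(
        "insert into {} (fname, path, tm, tag) values (?, ?, ?, ?)".format(tblName),
        rows,
    )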

4. Check the size of the generated database file (230,072 records, about 42 MB)

root@WJL-SH4031667:# ls -lh test.db
-rwxrwxrwx 1 root root 42M Nov 1 09:01 test.db

root@WJL-SH4031667: # sqlite3 test.db


sqlite> select count(*) from fmanages;
230072
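
For the record, the same count can be checked from Python as well; a throwaway snippet, not part of the script above:

import sqlite3

# Quick sanity check of the row count written by setFileToSql.py.
conn = sqlite3.connect("test.db")
print(conn.execute("select count(*) from fmanages").fetchone()[0])   # 230072 in the run above
conn.close()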

2、Step two: after files are added or removed, the corresponding database records are added or deleted accordingly.

1. The main addition is a comparison of two sets: the first set holds all the file names stored in the database from the previous run, and the second set holds the file names found this time. Subtracting one set from the other yields the added and removed files (a small standalone illustration follows the snippet below):


    # load the file records from the previous run out of the database
    extRet = dict(cursor.execute('select fname,tm from {}'.format(tblName)).fetchall())

    # build the previous and current sets of file names
    prekeys = extRet.keys()
    nowkeys = ret.keys()

    # files deleted since the last run
    delFile = prekeys - nowkeys
    ...

    # files added since the last run
    newFile = nowkeys - prekeys
    ...
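
Dictionary key views behave like sets, which is why the two subtractions above work without any explicit set() conversion. A tiny standalone illustration (the file names are made up):

# Hypothetical example data, purely for illustration.
extRet = {"./a.txt": 1.0, "./b.txt": 2.0}   # previous run, loaded from the DB
ret    = {"./a.txt": 1.0, "./c.txt": 3.0}   # current walk of the directory tree

print(extRet.keys() - ret.keys())   # {'./b.txt'}  -> deleted since the last run
print(ret.keys() - extRet.keys())   # {'./c.txt'}  -> newly added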

2. The code:

root@WJL-SH4031667: # cat setFileToSql.py


from os import walk
from os.path import join,getmtime
import sqlite3

@profile
def scan():
    dbName,tblName = "test.db","fmanages"
    conn = sqlite3.connect(dbName)
    cursor = conn.cursor()
    ret,extRet = {},{}

    i = 1
    for path, _, files in walk("./"):
        for f in files:
            if not f.endswith(""):
                continue

            filename = join(path, f)
            ret[filename] = getmtime(filename)
            i += 1

    ############### New code below; the key part is the two set differences for additions and deletions ###############
    # check whether the table already exists
    try:
        cursor.execute('create table {}(id varchar(20) primary key, fname varchar(30), path varchar(80), tm FLOAT, tag varchar(80))'.format(tblName))
    except:
        # table already exists: load the file records from the previous run
        extRet = dict(cursor.execute('select fname,tm from {}'.format(tblName)).fetchall())

    # build the previous and current sets of file names
    prekeys = extRet.keys()
    nowkeys = ret.keys()

    # files deleted since the last run
    delFile = prekeys - nowkeys
    print("Deleted files: {}".format(delFile))

    # files added since the last run
    newFile = nowkeys - prekeys
    print("New files: {}".format(newFile))

    try:
        for filename in delFile:
            cursor.execute("delete from {} where fname=\'{}\'".format(tblName, filename))
            print("Deleted file: {}".format(filename))
    except:
        pass

    try:
        for filename in newFile:
            cursor.execute("insert into {} (fname, path,tm, tag) values (\'{}\',\'new\',\'{}\', \'study\')".format(tblName, filename.rstrip(), getmtime(filename)))
            print("New file: {}".format(filename))
    except:
        pass
    ############################################

    cursor.close()
    conn.commit()
    conn.close()
    return ret


if __name__ == "__main__":
    scan()
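
One caveat about the two try/except blocks above: the handler sits outside the for loop, so a single filename that breaks the formatted SQL aborts all remaining deletes or inserts. A minimal sketch of the same diff application with "?" placeholders, so quoting can no longer fail (apply_diff is a hypothetical helper, not part of the script above):

from os.path import getmtime

def apply_diff(cursor, tblName, delFile, newFile):
    # Hedged sketch: apply the deletions and insertions computed above, using
    # parameterized SQL so unusual filenames cannot break the statement.
    for filename in delFile:
        cursor.execute("delete from {} where fname=?".format(tblName), (filename,))
    for filename in newFile:
        try:
            tm = getmtime(filename)
        except OSError:
            continue  # the file disappeared again between the walk and this insert
        cursor.execute(
            "insert into {} (fname, path, tm, tag) values (?, 'new', ?, 'study')".format(tblName),
            (filename, tm),
        )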

3. Memory usage after adding the add/delete feature (about 20 MB more, and roughly one extra minute of total run time)

(First, delete one file)


Filename: setFileToSql.py

Line #    Mem usage    Increment   Line Contents
================================================
     6   29.824 MiB   29.824 MiB   @profile
     7                             def scan():
     8   29.828 MiB    0.004 MiB       dbName,tblName = "test.db","fmanages"
     9   29.988 MiB    0.160 MiB       conn = sqlite3.connect(dbName)
    10   29.992 MiB    0.004 MiB       cursor = conn.cursor()
    11   29.992 MiB    0.000 MiB       ret,extRet = {},{}
    12                                 # check whether the table already exists
    13   29.992 MiB    0.000 MiB       try:
    14   30.477 MiB    0.484 MiB           cursor.execute('create table {}(id varchar(20) primary key, fname varchar(30), path varchar(80), tm FLOAT, tag varchar(80))'.format(tblName))
    15                                 except:
    16                                     extRet = dict(cursor.execute('select fname,tm from {}'.format(tblName)).fetchall())
    17
    18   30.477 MiB    0.000 MiB       i = 1
    19   97.188 MiB -5072.457 MiB       for path, _, files in walk("./"):
    20   97.188 MiB -35328.441 MiB           for f in files:
    21   97.188 MiB -31115.449 MiB               if not f.endswith(""):
    22                                             continue
    23
    24
    25   97.188 MiB -31114.238 MiB               filename = join(path, f)
    26   97.188 MiB -31074.410 MiB               ret[filename] = getmtime(filename)
    27   97.188 MiB -31111.219 MiB               i += 1
    28
    29                                 # build the previous and current sets of file names
    30   97.188 MiB    0.000 MiB       prekeys = extRet.keys()
    31   97.188 MiB    0.000 MiB       nowkeys = ret.keys()
    32
    33                                 # files deleted since the last run
    34   97.188 MiB    0.000 MiB       delFile = prekeys - nowkeys
    35   97.195 MiB    0.008 MiB       print("Deleted files: {}".format(delFile))
    36
    37                                 # files added since the last run
    38  111.578 MiB   14.383 MiB       newFile = nowkeys - prekeys
    39  114.059 MiB    2.480 MiB       print("New files: {}".format(newFile))
    40
    41  114.059 MiB    0.000 MiB       try:
    42  114.059 MiB    0.000 MiB           for filename in delFile:
    43                                         cursor.execute("delete from {} where fname=\'{}\'".format(tblName, filename))
    44                                         print("Deleted file: {}".format(filename))
    45                                 except:
    46                                     pass
    47
    48
    49  114.059 MiB    0.000 MiB       try:
    50  114.109 MiB    0.000 MiB           for filename in newFile:
    51  114.109 MiB    0.043 MiB               cursor.execute("insert into {} (fname, path,tm, tag) values (\'{}\',\'new\',\'{}\', \'study\')".format(tblName, filename.rstrip(), getmtime(filename)))
    52  114.109 MiB    0.008 MiB               print("New file: {}".format(filename))
    53  114.109 MiB    0.000 MiB       except:
    54  114.109 MiB    0.000 MiB           pass
    55
    56  114.109 MiB    0.000 MiB       cursor.close()
    57  114.109 MiB    0.000 MiB       conn.commit()
    58  114.109 MiB    0.000 MiB       conn.close()
    59  114.109 MiB    0.000 MiB       return ret
    
python -m memory_profiler setFileToSql.py  95.17s user 289.77s system 97% cpu 6:32.95 total

4. Check the change in the number of database records -- 230071 (230,071 records, indeed one fewer than last time)

root@WJL-SH4031667:# sqlite3 test.db


sqlite> select count(*) from fmanages;
230071

5. Screenshot of the result

