文本处理学习笔记5
环境如下:
python-2.5.2
工作中经常遇到类似csv格式的文件文件,
为了处理起来方便,一般要导入到数据库中
MYSQL导入工具代码:
# coding:UTF-8 import osimport sysimport timeimport MySQLdbperread = 1048576 # 1Moriginal = 32size_list = []for i in range(125): size_list.append(original) original = original + 32def getSize(aName, aSize): for i in size_list: if aSize<i: return i raise ValueError("字段太长 '%s' %s"%(aName, aSize))def loadCSV(fileName, separtor=""): print "loadCSV", fileName, "..." tf = file(fileName, "r") connMys = MySQLdb.connect(host="127.0.0.1", user="root", passwd="pass", db="test", charset="UTF8") begin = time.time() mark = 0 count = 0 currLines = tf.readlines(perread) count = count + len(currLines) - 1 currTable = [i.rstrip().split(separtor) for i in currLines if len(i.rstrip())>0] columnName = currTable[0] for i in columnName: if not i: raise ValueError("字段名不能为空!") columnCount = len(columnName) currTable = fillSpace(currTable, columnCount) currTable = distinctColumn(currTable) aName = os.path.split(fileName)[1].split(".")[0] dropSQL, createSQL, insertSQL = buildCreate(aName, currTable) cursorMys = connMys.cursor() print dropSQL, ";" cursorMys.execute(dropSQL) print createSQL, ";" cursorMys.execute(createSQL) connMys.commit() print insertSQL, ";" for i in currTable[1:]: cursorMys.execute(insertSQL, i) connMys.commit() print "insert", count, "lines." while tf.tell() > mark: mark = tf.tell() currLines = tf.readlines(perread) if not tf.tell() > mark: break count = count + len(currLines) currTable = [i.rstrip().split(separtor) for i in currLines if len(i.rstrip())>0] currTable = fillSpace(currTable, columnCount) for i in currTable: cursorMys.execute(insertSQL, i) connMys.commit() print "insert", count, "lines." end = time.time() print "Count:%s Time:%s"%(count, end - begin) cursorMys.close() connMys.close() tf.close()def fillSpace(aTable, colCount): for ii, iv in enumerate(aTable): while len(iv) < colCount: iv.append("") if len(iv) != colCount: raise ValueError("字段不够多!" + str(iv)) return aTabledef distinctColumn(aTable): columnNames = aTable[0] ucolumnNames = set(columnNames) if len(columnNames)==len(ucolumnNames): return aTable tcinclude = [] tcexclude = [] for i, v in enumerate(columnNames): if v in tcinclude: print "字段竟然重复", v tcexclude.append(i) else: tcinclude.append(v) result = [] for i in aTable: result.append([v for j, v in enumerate(i) if j not in tcexclude]) return resultdef buildCreate(aName, aTable): columnNames = aTable[0] columnSizes = [getSize(k, len(v)) for k, v in zip(aTable[0], aTable[1])] for i in aTable[2:]: for j, v in enumerate(i): if not len(v)<columnSizes[j]: columnSizes[j] = getSize(columnNames[j], len(v)) dropSQL = "DROP TABLE IF EXISTS " + aName createSQL = ["CREATE TABLE IF NOT EXISTS " + aName] createSQL.append("(") for k, v in zip(columnNames, columnSizes): currLines = " %s VARCHAR(%s),"%(k, v) createSQL.append(currLines) createSQL[-1] = " %s VARCHAR(%s)"%(k, v) createSQL.append(")") k = ", ".join(columnNames) v = ", ".join(["%s" for i in range(len(columnNames))]) insertSQL = "INSERT INTO %s (%s) VALUES (%s)"%(aName, k, v) return (dropSQL, "\n".join(createSQL), insertSQL)def usage(): print "CSVloader -h" print "CSVloader -f bsc.csv bts.csv" print "CSVloader -d dir1 dir2" print "CSVloader -f bsc.csv bts.csv -s ," print "CSVloader -d dir1 dir2 -s ," print "CSVloader -s , -f bsc.csv bts.csv" print "CSVloader -s , -d dir1 dir2" print " Options include:" print " -h [help] - 打印帮助" print " -s [separtor] - 使用的分割符,默认 tab" print " -f [fileName] - 文件名,可以指定多个" print " -d [dirName] - 文件夹,可以指定多个" print print " 注意:文件必须是 UTF-8 编码"if __name__ == "__main__": if "-h" in sys.argv: usage() elif "-d" in sys.argv: separtor = "" idxf = sys.argv.index("-d") if "-s" in sys.argv: idxs = sys.argv.index("-s") separtor = sys.argv[idxs+1] if idxs>idxf: for i in sys.argv[idxf+1:idxs]: for j in os.listdir(i): loadCSV(os.path.join(i, j), separtor) else: for i in sys.argv[idxf+1:]: for j in os.listdir(i): loadCSV(os.path.join(i, j), separtor) else: for i in sys.argv[idxf+1:]: for j in os.listdir(i): loadCSV(os.path.join(i, j), separtor) elif "-f" in sys.argv: separtor = "" idxf = sys.argv.index("-f") if "-s" in sys.argv: idxs = sys.argv.index("-s") separtor = sys.argv[idxs+1] if idxs>idxf: for i in sys.argv[idxf+1:idxs]: loadCSV(i, separtor) else: for i in sys.argv[idxf+1:]: loadCSV(i, separtor) else: for i in sys.argv[idxf+1:]: loadCSV(i, separtor) else: usage()@echo offpython CSVloader.py %*
oid,city,type,pid7,邢台市,103,78,廊坊市,103,81,沧州市,103,13,高邑县,104,44,河间市,104,12,赵县,104,4
# coding:GBK import osimport sysimport timeimport cx_Oracleperread = 1048576 # 1M# original = 64# size_list = []# while original<4000:# size_list.append(original)# original = original*2original = 32size_list = []for i in range(125): size_list.append(original) original = original + 32def getSize(aName, aSize): for i in size_list: if aSize<i: return i raise ValueError("字段太长 '%s' %s"%(aName, aSize))def loadCSV(fileName, separtor=""): print "loadCSV", fileName, "..." tf = file(fileName, "r") connOra = cx_Oracle.connect("user", "pass", "192.168.1.110:1521/testdb") begin = time.time() mark = 0 count = 0 currLines = tf.readlines(perread) count = count + len(currLines) - 1 currTable = [i.rstrip().split(separtor) for i in currLines if len(i.rstrip())>0] columnName = currTable[0] for i in columnName: if not i: raise ValueError("字段名不能为空!") columnCount = len(columnName) currTable = fillSpace(currTable, columnCount) currTable = distinctColumn(currTable) aName = os.path.split(fileName)[1].split(".")[0] createSQL, insertSQL = buildCreate(aName, currTable) cursorOra = connOra.cursor() print createSQL, ";" cursorOra.execute(createSQL) connOra.commit() print insertSQL, ";" cursorOra.prepare(insertSQL) cursorOra.executemany(None, currTable[1:]) connOra.commit() print "insert", count, "lines." while tf.tell() > mark: mark = tf.tell() currLines = tf.readlines(perread) if not tf.tell() > mark: break count = count + len(currLines) currTable = [i.rstrip().split(separtor) for i in currLines if len(i.rstrip())>0] currTable = fillSpace(currTable, columnCount) cursorOra.executemany(None, currTable) connOra.commit() print "insert", count, "lines." end = time.time() print "Count:%s Time:%s"%(count, end - begin) cursorOra.close() connOra.close() tf.close()def fillSpace(aTable, colCount): for ii, iv in enumerate(aTable): while len(iv) < colCount: iv.append("") if len(iv) != colCount: raise ValueError("字段不够多!" + str(iv)) return aTabledef distinctColumn(aTable): columnNames = aTable[0] ucolumnNames = set(columnNames) if len(columnNames)==len(ucolumnNames): return aTable tcinclude = [] tcexclude = [] for i, v in enumerate(columnNames): if v in tcinclude: print "字段竟然重复", v tcexclude.append(i) else: tcinclude.append(v) result = [] for i in aTable: result.append([v for j, v in enumerate(i) if j not in tcexclude]) return resultdef buildCreate(aName, aTable): columnNames = aTable[0] columnSizes = [getSize(k, len(v)) for k, v in zip(aTable[0], aTable[1])] for i in aTable[2:]: for j, v in enumerate(i): if not len(v)<columnSizes[j]: columnSizes[j] = getSize(columnNames[j], len(v)) createSQL = ["CREATE TABLE " + aName] createSQL.append("(") for k, v in zip(columnNames, columnSizes): currLines = " %s VARCHAR2(%s),"%(k, v) createSQL.append(currLines) createSQL[-1] = " %s VARCHAR2(%s)"%(k, v) createSQL.append(")") k = ", ".join(columnNames) v = ", ".join([":%s"%(i+1) for i in range(len(columnNames))]) insertSQL = "INSERT INTO %s (%s) VALUES (%s)"%(aName, k, v) return ("\n".join(createSQL), insertSQL)def usage(): print "CSVloader -h" print "CSVloader -f bsc.csv bts.csv" print "CSVloader -d dir1 dir2" print "CSVloader -f bsc.csv bts.csv -s ," print "CSVloader -d dir1 dir2 -s ," print "CSVloader -s , -f bsc.csv bts.csv" print "CSVloader -s , -d dir1 dir2" print " Options include:" print " -h [help] - 打印帮助" print " -s [separtor] - 使用的分割符,默认 tab" print " -f [fileName] - 文件名,可以指定多个" print " -d [dirName] - 文件夹,可以指定多个" print print " 注意:文件必须是 GBK 编码"if __name__ == "__main__": if "-h" in sys.argv: usage() elif "-d" in sys.argv: separtor = "" idxf = sys.argv.index("-d") if "-s" in sys.argv: idxs = sys.argv.index("-s") separtor = sys.argv[idxs+1] if idxs>idxf: for i in sys.argv[idxf+1:idxs]: for j in os.listdir(i): loadCSV(os.path.join(i, j), separtor) else: for i in sys.argv[idxf+1:]: for j in os.listdir(i): loadCSV(os.path.join(i, j), separtor) else: for i in sys.argv[idxf+1:]: for j in os.listdir(i): loadCSV(os.path.join(i, j), separtor) elif "-f" in sys.argv: separtor = "" idxf = sys.argv.index("-f") if "-s" in sys.argv: idxs = sys.argv.index("-s") separtor = sys.argv[idxs+1] if idxs>idxf: for i in sys.argv[idxf+1:idxs]: loadCSV(i, separtor) else: for i in sys.argv[idxf+1:]: loadCSV(i, separtor) else: for i in sys.argv[idxf+1:]: loadCSV(i, separtor) else: usage()