HDF5(.h5)是一种很适合保存数据的文件格式,其优势之一是支持快速读取,但向已有的表中新增列比较麻烦。比较好的做法是:将包含新列的数据写入到一个新的表中,然后删除原来的表。
以下代码实现了上述功能(此后的文章如未特别说明,均在 Windows + Python 3 环境下实现):
# -*- coding: utf-8 -*-
# Read a table from an HDF5 file, add one column, then write the enlarged
# table to a new HDF5 file.
import sys
import time

import numpy as np
import pandas as pd
import tables as tb

time0 = time.time()
pth = 'd:/download/'
data_file = pth + 'data0.h5'

# Read the whole source table into memory.  The source is only read, so it is
# opened read-only, and the context manager guarantees the handle is released.
with tb.open_file(data_file, mode='r', driver="H5FD_CORE") as filem:
    df0 = filem.get_node(where='/', name='FinancialData')
    df = pd.DataFrame.from_records(df0[:])

# New column: flag rows whose (Date, Code) pair has already appeared.
df['UniqueorNot'] = np.where(
    df.duplicated(subset=['Date', 'Code']), 'Duplicated', 'Unique')


def foo(atype):
    """Map a column's scalar type to one a PyTables description accepts.

    Object and str columns are mapped to the fixed-width byte string type
    'S10'; every other type is returned unchanged.

    NOTE(review): 'S10' holds at most 10 bytes per value; longer strings
    would presumably be truncated when written -- confirm for your data.
    """
    # PyTables tables cannot store Python objects or unicode strings.
    if atype is np.object_ or (isinstance(atype, type) and issubclass(atype, str)):
        return 'S10'
    return atype


def df_dtype(df):
    """Build the NumPy structured dtype describing the columns of *df*.

    Returns an ``np.dtype`` with one field per column, in column order,
    using ``foo`` to pick a storable type for each column.
    """
    # Python 2 needs byte-string field names (.encode()); Python 3 does not.
    is_py2 = sys.version_info[0] == 2
    types = []
    for col in df.columns:
        field_name = col.encode() if is_py2 else col
        types.append((field_name, foo(df[col].dtype.type)))
    return np.dtype(types)


dty = df_dtype(df)


def c_table(f, tname, tdty, data):
    """Create table *tname* under the root of *f* if needed and append *data*.

    f     -- an open, writable PyTables file.
    tname -- name of the table node under '/'.
    tdty  -- NumPy structured dtype used as the table description.
    data  -- iterable of rows; each row is converted to a tuple.

    If the table already exists it is NOT recreated, but the rows are still
    appended to it, so running the script twice stores the data twice.
    """
    try:
        f.create_table(where='/', name=tname, description=tdty)
    except tb.NodeError:
        # The table already exists: keep it and append below.
        print('\n', tname, 'is existing')
    table = f.get_node(where='/', name=tname)
    # Table.append takes a sequence of rows; each row must be a tuple.
    rows = [tuple(row) for row in data]
    if rows:
        table.append(rows=rows)
    table.flush()
    # To also export the stored table as CSV, uncomment:
    # pd.DataFrame.from_records(table[:]).to_csv(pth + 'data1.csv')


# Write the enlarged data to a new PyTables file.
data_file = pth + 'data1.h5'
namelist = ['FinancialData']
with tb.open_file(data_file, 'a') as data_h5:
    for table_name in namelist:
        # DataFrame.as_matrix() was removed in pandas 1.0; .values returns
        # the same array on both old and new pandas versions.
        c_table(data_h5, table_name, dty, df.values)
    # Flush explicitly; the file is closed when the with-block exits.
    data_h5.flush()

# Report the elapsed time.
time2 = time.time()
print('\n%8.4fs' % (time2 - time0))