Add a new column to a PyTables table

HDF5(.h5)是一种很好的数据存储格式,其优势之一是支持快速读取,但向已有的表中添加新列比较麻烦。比较稳妥的做法是:把包含新列的数据写入一张新表,然后再删除原来的表。

以下代码实现了上述功能(示例把新表写入新文件 data1.h5,删除旧表的步骤未包含在内;此后的文章如未特别说明,均在 Windows + Python 3 上实现)。

  1. # -*- coding: utf-8 -*-
  2. # read data from an h5 file, add one column, then write the new data to a new file
  3. import tables as tb
  4. import pandas as pd
  5. import numpy as np
  6. import time
  7. import sys
  8. time0=time.time()
  9. pth=‘d:/download/’
  10. data_file=pth+‘data0.h5’
  11. #read data from the file
  12. filem=tb.open_file(data_file,mode=‘a’,driver=“H5FD_CORE”)
  13. df0=filem.get_node(where=‘/’,name=‘FinancialData’)
  14. df = pd.DataFrame.from_records(df0[:])
  15. df[‘UniqueorNot’]=np.where(df.duplicated(subset=[‘Date’,‘Code’]),‘Duplicated’,‘Unique’)
  16. # convert object to string, because table can not accept object
  17. def foo(atype):
  18.     if atype==np.object_:
  19.         return ‘S10’
  20.     return atype
  21. def df_dtype(df):
  22.     cols = df.columns
  23.     if  sys.version_info[0] == 2:  # python 2 needs .encode() but 3 does not
  24.         types = [(cols[i].encode(), foo(df[k].dtype.type)) for (i, k) in enumerate(cols)]
  25.     else:
  26.         types = [(cols[i], foo(df[k].dtype.type)) for (i, k) in enumerate(cols)]
  27.     dtype = np.dtype(types)
  28.     return dtype
  29. dty = df_dtype(df)
  30. # write to a new PyTables table
  31. data_file=pth+‘data1.h5’
  32. data_h5 = tb.open_file(data_file, ‘a’)
  33. # make a definiton of creating Pytables
  34. def c_table(f,tname,tdty,data):
  35.     try:
  36.         f.create_table(where=‘/’, name=tname, description=tdty)
  37. #    if the table has already been created, then do nothing
  38.     except tb.NodeError:
  39.         print(\n, tname, ‘is existing’)
  40.         pass
  41. #    get data from table
  42.     c=f.get_node(where=‘/’,name=tname)
  43. #   data can be list, and internal type is tuple
  44.     ts = [tuple(s) for s in data]
  45.     c.append(rows=ts)
  46.     c.flush()
  47. # if you do not want to save the file to csv, you can drop the following two rows.
  48. #    df = pd.DataFrame.from_records(c[:])
  49. #    df.to_csv(pth+’data1.csv’)
  50. # write new data to file
  51. namelist=[‘FinancialData’]
  52. for c in namelist:
  53.     c_table(data_h5,c,dty,df.as_matrix())
  54. #flush and close the file
  55. data_h5.flush()
  56. data_h5.close()
  57. # check the time elapsed
  58. time2=time.time()
  59. print(\n%8.4fs’ %(time2-time0))