2017年7月22日 – Renke's 所见所思

HDF5（.h5）是一种很好的数据存储格式，它的一个优势是支持快速读取，但是向已有的表中添加新列比较麻烦。比较好的做法是：把原表读出来，加上新列后写入到一个新的表中，然后删除原来的表。

以下代码实现了上述功能（此后的文章如未另加说明，均在 Windows + Python 3 上实现）：

# -*- coding: utf-8 -*-

# read data from an h5 file, add one column, then write the new data to a new file

import tables as tb

import pandas as pd

import numpy as np

import time

import sys

# Start the wall-clock timer used for the elapsed-time report at the end.
time0 = time.time()

pth = 'd:/download/'
data_file = pth + 'data0.h5'

# Read the source table into memory. The H5FD_CORE driver asks HDF5 to work
# on an in-memory image of the file.
filem = tb.open_file(data_file, mode='a', driver="H5FD_CORE")
try:
    df0 = filem.get_node(where='/', name='FinancialData')
    # df0[:] reads every row into a NumPy structured array, so the
    # DataFrame no longer depends on the open file.
    df = pd.DataFrame.from_records(df0[:])
finally:
    # Release the source file handle once the data has been copied out.
    filem.close()

# New column: flag rows whose (Date, Code) pair repeats an earlier row.
df['UniqueorNot'] = np.where(
    df.duplicated(subset=['Date', 'Code']), 'Duplicated', 'Unique')

# convert object to string, because table can not accept object

def foo(atype):
    """Map a NumPy scalar type to one usable in a PyTables description.

    ``np.object_`` is replaced by the fixed-width byte-string dtype
    ``'S10'`` (10 bytes per value); every other type is returned unchanged.

    Parameters
    ----------
    atype : type
        NumPy scalar type of a DataFrame column (``dtype.type``).

    Returns
    -------
    type or str
        ``'S10'`` for object columns, otherwise ``atype`` itself.
    """
    if atype == np.object_:
        return 'S10'
    return atype

def df_dtype(df):
    """Build a NumPy structured dtype describing the columns of ``df``.

    Each column contributes one ``(name, type)`` field, where the type is
    the column's NumPy scalar type passed through ``foo`` so that object
    columns become fixed-width byte strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column names and dtypes define the fields.

    Returns
    -------
    numpy.dtype
        Structured dtype with one field per column, in column order.
    """
    cols = df.columns
    if sys.version_info[0] == 2:  # Python 2 needs .encode() but 3 does not
        types = [(name.encode(), foo(df[name].dtype.type)) for name in cols]
    else:
        types = [(name, foo(df[name].dtype.type)) for name in cols]
    return np.dtype(types)

# Structured dtype used as the description of the new table.
dty = df_dtype(df)

# write to a new PyTables table
data_file = pth + 'data1.h5'
data_h5 = tb.open_file(data_file, 'a')

# define a helper that creates a PyTables table and fills it with data

def c_table(f, tname, tdty, data):
    """Create table ``tname`` under the root of ``f`` and append ``data``.

    If creating the table raises ``tb.NodeError`` (the node already exists),
    a notice is printed and the rows are appended to the existing table.

    Parameters
    ----------
    f : tables.File
        Open, writable PyTables file.
    tname : str
        Name of the table, created directly under ``'/'``.
    tdty : numpy.dtype
        Structured dtype used as the table description.
    data : iterable of sequences
        Rows to append; each row is converted to a tuple.
    """
    try:
        f.create_table(where='/', name=tname, description=tdty)
    except tb.NodeError:
        # if the table has already been created, then do nothing
        print('\n', tname, 'is existing')

    # get the table node (freshly created or pre-existing)
    table = f.get_node(where='/', name=tname)

    # data can be a list, but each row handed to append() is a tuple
    rows = [tuple(row) for row in data]
    table.append(rows=rows)
    table.flush()

    # To also export the table to CSV, the following two lines could be
    # enabled here:
    #     df = pd.DataFrame.from_records(table[:])
    #     df.to_csv(pth + 'data1.csv')

# write new data to file

namelist = ['FinancialData']
for c in namelist:
    # DataFrame.as_matrix() was removed in pandas 1.0; .values yields the
    # same 2-D array of rows.
    c_table(data_h5, c, dty, df.values)

# flush and close the file
data_h5.flush()
data_h5.close()

# check the time elapsed
time2 = time.time()
print('\n%8.4fs' % (time2 - time0))

日	一	二	三	四	五	六
						1
2	3	4	5	6	7	8
9	10	11	12	13	14	15
16	17	18	19	20	21	22
23	24	25	26	27	28	29
30	31

每日归档： 2017年7月22日

Add new column to Pytables