Multiprocessing

最近在写一个代码,感觉要run很久,在群里问,有人建议采用multiprocessing。开始看官方文档。然而有一个程序却测试跑不出来。

# -*- coding: utf-8 -*-
"""
Created on Sun Jul 30 19:42:12 2017
multiprocessing_test
"""

import multiprocessing as mp

def f(name):
      print('hello', name)

if __name__ == '__main__':
       p = mp.Process(target=f, args=('bob',))
       p.start()
       p.join()
# 在anaconda的spyder中,一定要有这一句才ok。
# 但是在fluent python中,不需要这一句
      p.run() #output为 hello bob

#下面的程序运行正常
def f2(x):
     return x**x

if __name__ == '__main__':
     with mp.Pool(5) as p:
            print(p.map(f2, [1, 2, 3])) #output为[1, 4, 9]
Print Friendly, PDF & Email

groupby_fillna_for special columns

Well, there is no problem to merge data and fillna.

sometimes we want to fillna with groupby, more specifically, we need to fillna with different methods, some are ffill, while others are fillna(0).

the following code can deal with this situation.

# -*- coding: utf-8 -*-
“””
groupby_fillna_special columns
https://stackoverflow.com/questions/21284585
“””

import pandas as pd
from io import StringIO

# fillna with groupby
csvdata=”””
STK_ID RPT_Date sales opr_pft net_pft
002138 20130331 2.0703 0.3373 0.2829
002138 20130630 NaN NaN NaN
002138 20130930 7.4993 1.2248 1.1630
002138 20140122 NaN NaN NaN

600004 20130331 11.8429 3.0816 2.1637
600004 20130630 24.6232 6.2152 4.5135
600004 20130930 37.9673 9.2088 6.6463
600004 20140122 NaN NaN NaN

600809 20130331 27.9517 9.9426 7.5182
600809 20130630 40.6460 13.9414 9.8572
600809 20130930 53.0501 16.8081 11.8605
600809 20140122 NaN NaN NaN
“””
sio=StringIO(csvdata)

df=pd.read_csv(sio, sep=’\s+’,header=0,dtype={“STK_ID”:object,”RPT_Date”:object,
“sales”:float,’apr_pft’:float,
‘net_pft’:float})
df.set_index([‘STK_ID’,’RPT_Date’],inplace=True)

#ffill all columns with groupby
#DataFrame.groupby(by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, squeeze=False, **kwargs)[source]
df.groupby(level=0).apply(lambda grp: grp.fillna(method=’ffill’))

#fillnow the lastrow with groupby
def f(g):
last = len(g.values)-1
g.iloc[last,:] = g.iloc[last-1,:]
return g
df.groupby(level=0).apply(f)

# fill certain columns
df[[‘sales’,’opr_pft’]]=df.groupby(level=0)[[‘sales’,’opr_pft’]].fillna(method=’ffill’)
#or you can write as following
df[[‘sales’,’opr_pft’]]=df.groupby(level=0)[[‘sales’,’opr_pft’]].apply(lambda grp: grp.fillna(method=’ffill’))

Print Friendly, PDF & Email

upsampling with multiindex

I want to resampling(more specifically, upsampling) data with multiindex.

Thanks for stackoverflow, https://stackoverflow.com/questions/39737890/, I find a solution.  the code has some minor error, I have corret it.

# -*- coding: utf-8 -*-
“””
upsampling with multiindex
“””

import pandas as pd
import datetime
import numpy as np
np.random.seed(1234)

arrays = [np.sort([datetime.date(2016, 8, 31),
datetime.date(2016, 7, 31),
datetime.date(2016, 6, 30)]*5),
[‘A’, ‘B’, ‘C’, ‘D’, ‘E’]*3]
df = pd.DataFrame(np.random.randn(15, 4), index=arrays)
df.index.rename([‘date’, ‘id’], inplace=True)
print(df)

df1=df.unstack().resample(‘W-FRI’).last().ffill().stack()
print(‘\n df1:\n’,df1)

the result is as following:

                         0                    1                2                 3
date            id
2016-06-30 A 0.471435 -1.190976 1.432707 -0.312652
B -0.720589 0.887163 0.859588 -0.636524
C 0.015696 -2.242685 1.150036 0.991946
D 0.953324 -2.021255 -0.334077 0.002118
E 0.405453 0.289092 1.321158 -1.546906
2016-07-31 A -0.202646 -0.655969 0.193421 0.553439
B 1.318152 -0.469305 0.675554 -1.817027
C -0.183109 1.058969 -0.397840 0.337438
D 1.047579 1.045938 0.863717 -0.122092
E 0.124713 -0.322795 0.841675 2.390961
2016-08-31 A 0.076200 -0.566446 0.036142 -2.074978
B 0.247792 -0.897157 -0.136795 0.018289
C 0.755414 0.215269 0.841009 -1.445810
D -1.401973 -0.100918 -0.548242 -0.144620
E 0.354020 -0.035513 0.565738 1.545659

df1:
0              1                2                 3
date            id
2016-07-01 A 0.471435 -1.190976 1.432707 -0.312652
B -0.720589 0.887163 0.859588 -0.636524
C 0.015696 -2.242685 1.150036 0.991946
D 0.953324 -2.021255 -0.334077 0.002118
E 0.405453 0.289092 1.321158 -1.546906
2016-07-08 A 0.471435 -1.190976 1.432707 -0.312652
B -0.720589 0.887163 0.859588 -0.636524
C 0.015696 -2.242685 1.150036 0.991946
D 0.953324 -2.021255 -0.334077 0.002118
E 0.405453 0.289092 1.321158 -1.546906
2016-07-15 A 0.471435 -1.190976 1.432707 -0.312652
B -0.720589 0.887163 0.859588 -0.636524
C 0.015696 -2.242685 1.150036 0.991946
D 0.953324 -2.021255 -0.334077 0.002118
E 0.405453 0.289092 1.321158 -1.546906
2016-07-22 A 0.471435 -1.190976 1.432707 -0.312652
B -0.720589 0.887163 0.859588 -0.636524
C 0.015696 -2.242685 1.150036 0.991946
D 0.953324 -2.021255 -0.334077 0.002118
E 0.405453 0.289092 1.321158 -1.546906
2016-07-29 A 0.471435 -1.190976 1.432707 -0.312652
B -0.720589 0.887163 0.859588 -0.636524
C 0.015696 -2.242685 1.150036 0.991946
D 0.953324 -2.021255 -0.334077 0.002118
E 0.405453 0.289092 1.321158 -1.546906
2016-08-05 A -0.202646 -0.655969 0.193421 0.553439
B 1.318152 -0.469305 0.675554 -1.817027
C -0.183109 1.058969 -0.397840 0.337438
D 1.047579 1.045938 0.863717 -0.122092
E 0.124713 -0.322795 0.841675 2.390961
2016-08-12 A -0.202646 -0.655969 0.193421 0.553439
B 1.318152 -0.469305 0.675554 -1.817027
C -0.183109 1.058969 -0.397840 0.337438
D 1.047579 1.045938 0.863717 -0.122092
E 0.124713 -0.322795 0.841675 2.390961
2016-08-19 A -0.202646 -0.655969 0.193421 0.553439
B 1.318152 -0.469305 0.675554 -1.817027
C -0.183109 1.058969 -0.397840 0.337438
D 1.047579 1.045938 0.863717 -0.122092
E 0.124713 -0.322795 0.841675 2.390961
2016-08-26 A -0.202646 -0.655969 0.193421 0.553439
B 1.318152 -0.469305 0.675554 -1.817027
C -0.183109 1.058969 -0.397840 0.337438
D 1.047579 1.045938 0.863717 -0.122092
E 0.124713 -0.322795 0.841675 2.390961
2016-09-02 A 0.076200 -0.566446 0.036142 -2.074978
B 0.247792 -0.897157 -0.136795 0.018289
C 0.755414 0.215269 0.841009 -1.445810
D -1.401973 -0.100918 -0.548242 -0.144620
E 0.354020 -0.035513 0.565738 1.545659

Print Friendly, PDF & Email

read h5 and remove prefix ‘b’

h5文件是存储大数据的一种较好格式,在读取该文件的时候,有时候会有prefix ‘b’.因此,要设法去除该prefix。

# -*- coding: utf-8 -*-
import tables as tb
import pandas as pd
import numpy as np
import time

time0=time.time()
pth=’d:/download/’

# 读取交易的数据
data_trading=pth+’Trading_v01.h5′
filem=tb.open_file(data_trading,mode=’a’,driver=”H5FD_CORE”)
tb_trading=filem.get_node(where=’/’, name=’wind_data’)
df=pd.DataFrame.from_records(tb_trading[:])
time1=time.time()
print(‘\ntime on reading data: %6.3fs’ %(time1-time0))
# in python3, remove prefix ‘b’
for key in [‘Date’, ‘Code’]:
df[key] = df[key].str.decode(“utf-8”)

# the following two methods are
# method 1
#str_df = df.loc[:,[‘Date’,’Code’]]
#str_df = str_df.stack().str.decode(‘utf-8′).unstack()
#for col in str_df:
# df[col] = str_df[col]
#method 2
#df.loc[:,’Date’]=[[dt.decode(‘utf-8′)] for dt in df.loc[:,’Date’]]
#df.loc[:,’Code’]=[[cd.decode(‘utf-8′)] for cd in df.loc[:,’Code’]]

time2=time.time()
print(“\ntime on removing prefix ‘b’: %6.3fs” %(time2-time1))
print(‘\ntotal time: %6.3fs’ %(time2-time0))

运行结果如下

time on reading data: 1.508s

time on removing prefix ‘b’: 9.315s

total time: 10.823s

不知道有没有更快捷的方式。

Print Friendly, PDF & Email

Add new column to Pytables

h5是很好的保存文件的格式,该文件格式有一个优势是支持快速读取,但是在写入列上有些麻烦。比较好的写入新列的方式,是将新生成的数据写入到一个新的表中,然后删除原来的表。

以下代码就是实现上述功能(此后文章未加说明,均为在Windows+Python 3 上实现)

  1. # -*- coding: utf-8 -*-
  2. # read data from h5 file and add one colume, then write new data to a new file
  3. import tables as tb
  4. import pandas as pd
  5. import numpy as np
  6. import time
  7. import sys
  8. time0=time.time()
  9. pth=‘d:/download/’
  10. data_file=pth+‘data0.h5’
  11. #read data from the file
  12. filem=tb.open_file(data_file,mode=‘a’,driver=“H5FD_CORE”)
  13. df0=filem.get_node(where=‘/’,name=‘FinancialData’)
  14. df = pd.DataFrame.from_records(df0[:])
  15. df[‘UniqueorNot’]=np.where(df.duplicated(subset=[‘Date’,‘Code’]),‘Duplicated’,‘Unique’)
  16. # convert object to string, because table can not accept object
  17. def foo(atype):
  18.     if atype==np.object_:
  19.         return ‘S10’
  20.     return atype
  21. def df_dtype(df):
  22.     cols = df.columns
  23.     if  sys.version_info[0] == 2:  # python 2 needs .encode() but 3 does not
  24.         types = [(cols[i].encode(), foo(df[k].dtype.type)) for (i, k) in enumerate(cols)]
  25.     else:
  26.         types = [(cols[i], foo(df[k].dtype.type)) for (i, k) in enumerate(cols)]
  27.     dtype = np.dtype(types)
  28.     return dtype
  29. dty = df_dtype(df)
  30. # write to a new PyTables table
  31. data_file=pth+‘data1.h5’
  32. data_h5 = tb.open_file(data_file, ‘a’)
  33. # make a definiton of creating Pytables
  34. def c_table(f,tname,tdty,data):
  35.     try:
  36.         f.create_table(where=‘/’, name=tname, description=tdty)
  37. #    if the table has already been created, then do nothing
  38.     except tb.NodeError:
  39.         print(\n, tname, ‘is existing’)
  40.         pass
  41. #    get data from table
  42.     c=f.get_node(where=‘/’,name=tname)
  43. #   data can be list, and internal type is tuple
  44.     ts = [tuple(s) for s in data]
  45.     c.append(rows=ts)
  46.     c.flush()
  47. # if you do not want to save the file to csv, you can drop the following two rows.
  48. #    df = pd.DataFrame.from_records(c[:])
  49. #    df.to_csv(pth+’data1.csv’)
  50. # write new data to file
  51. namelist=[‘FinancialData’]
  52. for c in namelist:
  53.     c_table(data_h5,c,dty,df.as_matrix())
  54. #flush and close the file
  55. data_h5.flush()
  56. data_h5.close()
  57. # check the time elapsed
  58. time2=time.time()
  59. print(\n%8.4fs’ %(time2-time0))
Print Friendly, PDF & Email