Displaying Chinese characters in Jupyter (matplotlib)
# Install the SimHei font so matplotlib can render Chinese (CJK) characters.
import matplotlib
# Print the path of the active matplotlibrc file — its directory contains
# the bundled mpl-data/fonts/ttf folder the font must be copied into.
matplotlib.matplotlib_fname()
# Shell magic: move the SimHei font file into matplotlib's bundled font dir.
# NOTE(review): this path is specific to a PAI python3.6 image — adjust elsewhere.
!mv simhei.ttf /home/pai/lib/python3.6/site-packages/matplotlib/mpl-data/fonts/ttf/
# Use SimHei as the default sans-serif font for figures.
# (assumes `plt` was imported as matplotlib.pyplot in an earlier cell — TODO confirm)
plt.rcParams['font.sans-serif']=['SimHei']
# Keep the minus sign rendering correctly when a CJK font is active.
plt.rcParams['axes.unicode_minus']=False
Reference:
– https://www.freeaihub.com/post/110531.html
– https://stackoverflow.com/questions/42097053/matplotlib-cannot-find-basic-fonts
Jupyter notebook settings
# Common per-notebook display/warning settings.
# limit the number of rows that pandas dataframe prints
# (assumes `pd` was imported as pandas in an earlier cell — TODO confirm)
pd.set_option('display.max_rows', 100)
# ignore warning messages
import warnings
warnings.filterwarnings('ignore')
# improve figure resolution (IPython line magic; doubles inline-figure DPI)
%config InlineBackend.figure_format = "retina"
Python ODPS
Read table from ODPS with parallel processing
# configure environment
import odps
# make the shared utility module (read_records/to_pandas helpers) importable
import sys; sys.path.append("/home/admin/workspace")
from utility import read_records, to_pandas
# NOTE(review): 'access_id'/'access_key' below are placeholders — substitute
# real credentials at runtime; never commit real secrets to notes like this.
odps_config = {"access_id": 'access_id',
"access_key": 'access_key',
"project": 'cnalgo_dev',
"endpoint": 'http://service-corp.odps.aliyun-inc.com/api'}
# Connected ODPS client used by all read/write snippets below.
o = odps.ODPS(odps_config["access_id"], odps_config["access_key"], odps_config["project"], odps_config["endpoint"])
%%time
# read tables from ODPS
input_table = 'cnalgo_sta_repeat_customers_random_subsample_treat_control_union'
# Parallel download with 8 worker processes (see to_pandas below);
# requires the `o` client created in the configuration cell.
df = to_pandas(input_table, o, N_WORKERS=8)
Write table into ODPS
from utility import odps_writer
target_table = 'cnalgo_sta_repeat_customers_random_subsample_treat'
# Select only the buyer-id column for upload.
# NOTE(review): `buyers_treat_sampled` is defined in an earlier cell not shown
# here; the trailing commented-out .apply() looks like an abandoned string
# conversion — confirm whether biz_buyer_id must be cast to str before writing.
output_table = buyers_treat_sampled[['biz_buyer_id']]#.apply(lambda x: str(x.biz_buyer_id), axis=1)
# Append the rows to the existing target table (see odps_writer below).
odps_writer(output_table, o.get_table(target_table))
import pandas as pd
from odps.tunnel import TableTunnel
from multiprocessing import Pool
def read_records(tunnel, table, session_id, start, count, columns):
    """Download ``count`` rows of ``table`` starting at row ``start``.

    Re-attaches to an existing tunnel download session (``session_id``) so
    that multiple worker processes can pull disjoint row ranges from the
    same table snapshot.

    Args:
        tunnel: odps.tunnel.TableTunnel instance.
        table: ODPS table object (only ``table.name`` is used here).
        session_id: id of a download session created by the parent process.
        start: index of the first row to read.
        count: number of rows to read.
        columns: column names, in the table's schema order (must align with
            ``record.values``).

    Returns:
        pandas.DataFrame with one column per entry in ``columns``.
    """
    # Re-attach to the shared session instead of creating a new one, so all
    # workers read from the same snapshot.
    local_session = tunnel.create_download_session(table.name, download_id=session_id)
    # Accumulate values column-wise, then build the DataFrame once at the
    # end — far cheaper than growing a DataFrame row by row.
    result = {name: [] for name in columns}
    with local_session.open_record_reader(start, count) as reader:
        for record in reader:
            # zip pairs each value with its column name; .append avoids the
            # original `+= [value]` which allocated a throwaway list per cell.
            for name, value in zip(columns, record.values):
                result[name].append(value)
    return pd.DataFrame(result)
def to_pandas(table_name, o, N_WORKERS=3):
    """Download an entire ODPS table into one pandas DataFrame in parallel.

    Creates a single tunnel download session and shares its id with
    ``N_WORKERS`` worker processes; each worker re-attaches to the session
    and reads a disjoint slice of rows via ``read_records``.

    Args:
        table_name: name of the ODPS table to read.
        o: connected ``odps.ODPS`` client.
        N_WORKERS: number of worker processes; reduced to 1 when the table
            has fewer rows than workers.

    Returns:
        pandas.DataFrame containing every row of the table, re-indexed 0..n-1.
    """
    table = o.get_table(table_name)
    tunnel = TableTunnel(o)
    # Workers re-attach to this session by id, so they all read from the
    # same consistent snapshot of the table.
    download_session = tunnel.create_download_session(table.name)
    session_id = download_session.id
    # Fetch the row count and close the reader (the original code called
    # table.open_reader().count and leaked the reader).
    with table.open_reader() as reader:
        count = reader.count
    if count < N_WORKERS:
        # Too few rows to split; one worker reads everything.
        N_WORKERS = 1
    # Integer floor division replaces math.floor(count / N_WORKERS).
    chunk_count = count // N_WORKERS
    columns = table.schema.names
    pool = Pool(processes=N_WORKERS)
    chunks = []
    for i in range(N_WORKERS):
        start_i = i * chunk_count
        # The last worker also takes the remainder rows left by the floor.
        count_i = (count - i * chunk_count) if (i + 1) == N_WORKERS else chunk_count
        chunks.append(pool.apply_async(read_records, (tunnel, table, session_id, start_i, count_i, columns)))
    pool.close()
    pool.join()
    df = pd.concat([c.get() for c in chunks])
    return df.reset_index(drop=True)
def odps_writer(df, t):
    """Append every row of DataFrame ``df`` to the ODPS table ``t``.

    Args:
        df: pandas.DataFrame whose rows are uploaded in column order.
        t: ODPS table object providing ``open_writer()``.
    """
    # Convert the frame to a list of row-lists up front, then hand the whole
    # batch to the tunnel writer in one call.
    records = df.values.tolist()
    with t.open_writer() as writer:
        writer.write(records)