0%

pandas常用操作

显示所有列

import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

df有两列索引a/b,两列数据列c/d。

  • 将df按照a分组,组内按照b从小到大排序,若c列前一行值不为nan则将c列按照前一个值填充
df_sorted = df.sort_index(level=['a', 'b'])  

def fill_group(group):
group['c'] = group['c'].fillna(method='ffill')
return group

df_result = df_sorted.groupby(level='a', group_keys=False).apply(fill_group)
  • 前后遍历补充marketcap值
def ffill_group(group):
group['ratio'] = group['open'] / group['open'].shift(1)
last_valid_marketcap = None
for index, row in group.iterrows():
if pd.isna(row['marketcap']):
if last_valid_marketcap is not None and not pd.isna(row['ratio']):
group.at[index, 'marketcap'] = last_valid_marketcap * row['ratio']
if not pd.isna(group.at[index, 'marketcap']):
last_valid_marketcap = group.at[index, 'marketcap']
group = group.drop('ratio', axis=1)
return group


def bfill_group(group):
group['reverse_ratio'] = group['open'] / group['open'].shift(-1)
last_valid_marketcap = None
for index in reversed(group.index):
row = group.loc[index]
if pd.isna(row['marketcap']):
if last_valid_marketcap is not None and not pd.isna(row['reverse_ratio']):
# 如果当前 marketcap 是 NaN,且我们有下一个有效值和有效的反向比率
group.at[index, 'marketcap'] = last_valid_marketcap / row['reverse_ratio']
if not pd.isna(group.at[index, 'marketcap']):
last_valid_marketcap = group.at[index, 'marketcap']
group = group.drop('reverse_ratio', axis=1)
return group
  • 删除重复索引的行,保留第一次出现的行
df_no_duplicates = df.loc[~df.index.duplicated(keep='first')]  
  • 假设a列是时间类型,计算每个 b 分组内 c 列的每日总和
# 按天分组并聚合  
# 使用 groupby 和 resample 来计算每个 b 分组内 c 列的每日总和
result = df.groupby(level='b')['c'].resample('D', level='a').sum()

# 若是两列
result = df.groupby(level='b').agg({
'c': lambda x: x.resample('D', level='a').sum(),
'd': lambda x: x.resample('D', level='a').sum()
})
  • 如果 df2 中没有对应的索引,df1 中的原值会被保留
# 使用 update 方法更新 df1  
df1.update(df2[['net_inflow']])
  • 筛选出某一行
# 1. 使用完整的索引值  
print("\n1. 选择 a=1, b='x' 的行:")
print(df.loc[(1, 'x')])

# 2. 使用部分索引值
print("\n2. 选择 a=1 的所有行:")
print(df.loc[1])

# 3. 使用切片
print("\n3. 选择 a 在 1 到 2 之间的所有行:")
print(df.loc[1:2])

# 4. 使用布尔索引
print("\n4. 选择 a 大于 1 的所有行:")
print(df.loc[df.index.get_level_values('a') > 1])

# 5. 使用 xs() 方法
print("\n5. 选择 b='y' 的所有行:")
print(df.xs('y', level='b'))
  • 左连接
# 使用 merge 函数进行基于索引的左连接  
result = df1.merge(df2, left_index=True, right_index=True, how='left')
  • 打印不同
index_diff = set(df1.index).symmetric_difference(set(df2.index))
print("\n差异:")
print(index_diff)

读取txt为csv

import pandas as pd

def txt_to_csv(_txt_file_path,_csv_file_path,_columns):
result = []
with open(_txt_file_path, 'r') as file:
for line in file:
# 去除行末的换行符,然后按制表符分割
row = line.strip().split('\t')
result.append(row)
def array_to_csv(data, csv_file_path, columns):
df = pd.DataFrame(data, columns=columns)
df.to_csv(csv_file_path, index=False)
array_to_csv(result,_csv_file_path,_columns)


txt_to_csv('./raw_data/txt/top_position_lsr.txt'
,'./raw_data/csv/top_position_lsr.csv'
,['timestamp','symbol','top_position_lsr'])

df打印有空值的行

na_df = df[df.isnull().any(axis=1)]

df改变列名

fr = pd.read_csv('raw_data/csv/fundingrate.csv')
fr['symbol'] = fr['symbol'].str[:] + '/USDT:USDT'

df时间戳列改日期

df['timestamp'] = df['timestamp'].astype(int)  

# 将10位时间戳转换为datetime对象
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
df['timestamp'] = df['timestamp'].dt.strftime('%Y-%m-%d %H:%M:%S')

# 将13位时间戳转换为datetime对象
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
df['timestamp'] = df['timestamp'].dt.strftime('%Y-%m-%d %H:%M:%S')

df遍历所有行判断某列是否为NaN

for index,row in df.iterrows():
if pd.isna(row['funding_rate_instant']):
try:
df1.loc[index, 'top_position_lsr'] = df2.loc[index, 'top_position_lsr']
except Exception as e:
pass