pandas dataframe 过滤——apply最灵活!!!
按照某特定string字段长度过滤:
import pandas as pd
# Read both columns as text up front. Letting pandas infer a numeric dtype and
# converting back with astype('str') afterwards alters the text being measured:
# leading zeros are dropped, and an integer column containing a missing value
# is parsed as float, so '1234567890' would become '1234567890.0'.
df = pd.read_csv('filex.csv', dtype={'A': str, 'B': str})
# Keep only the rows where both fields are exactly 10 characters long.
# Missing values have no length (NaN), so the comparison is False for them
# and those rows are dropped.
mask = (df['A'].str.len() == 10) & (df['B'].str.len() == 10)
df = df.loc[mask]
print(df)
Applied to filex.csv:
A,B
123,abc
1234,abcd
1234567890,abcdefghij
the code above prints
            A           B
2  1234567890  abcdefghij
或者是:
'''
遇到问题没人解答?小编创建了一个Python学习交流QQ群:857662006 寻找有志同道合的小伙伴,
互帮互助,群里还有不错的视频学习教程和PDF电子书!
'''
# Small demo frame; every column (including "age") holds strings.
data = {
    "names": ["Alice", "Zac", "Anna", "O"],
    "cars": ["Civic", "BMW", "Mitsubishi", "Benz"],
    "age": ["1", "4", "2", "0"],
}
df = pd.DataFrame(data)

# df (columns follow the dict's insertion order on current pandas; older
# releases sorted them alphabetically as age / cars / names):
#
#    names        cars age
# 0  Alice       Civic   1
# 1    Zac         BMW   4
# 2   Anna  Mitsubishi   2
# 3      O        Benz   0
#
# Combine three element-wise predicates, one per column, into a single
# boolean mask:
#   - the name is longer than one character
#   - the car model contains the letter "i"
#   - the age, parsed as an integer, is below 2
df[
    (df["names"].apply(lambda name: len(name) > 1))
    & (df["cars"].apply(lambda car: "i" in car))
    & (df["age"].apply(lambda age: int(age) < 2))
]

# Only row 0 satisfies all three conditions:
#
#    names   cars age
# 0  Alice  Civic   1
最灵活的是用apply:
def load_metadata(dir_name):
    """Yield one filtered metadata DataFrame per file found under ``dir_name``.

    The directory tree is walked recursively. Every file is parsed as a
    '^'-delimited, UTF-8 text file whose first line is a header that gets
    replaced by the names in ``columns_name_list``. Malformed lines are
    skipped with a warning. Each resulting frame is reduced to the rows
    accepted by ``metadata_parse_filter`` and then yielded.

    Args:
        dir_name: Root directory to scan for metadata files.

    Yields:
        pandas.DataFrame: the rows of one file that passed the filter
        (possibly none).
    """
    # Local import: only needed for the pandas-version probe further down.
    import inspect

    # NOTE(review): pandas ignores the order of ``usecols`` and assigns
    # ``names`` in file-column order, so the two lists below line up only if
    # the MetaIndex values are ascending - confirm against MetaIndex.
    columns_index_list = [
        MetaIndex.M_METADATA_ID_INDEX,
        MetaIndex.M_SRC_IP_INDEX,
        MetaIndex.M_DST_IP_INDEX,
        MetaIndex.M_SRC_PORT_INDEX,
        MetaIndex.M_DST_PORT_INDEX,
        MetaIndex.M_PROTOCOL_INDEX,
        MetaIndex.M_HEADER_H,
        MetaIndex.M_PAYLOAD_H,
        MetaIndex.M_TCP_FLAG_H,
        MetaIndex.M_FLOW_FIRST_PKT_TIME,
        MetaIndex.M_FLOW_LAST_PKT_TIME,
        MetaIndex.M_OCTET_DELTA_COUNT_FROM_TOTAL_LEN,
    ]
    columns_name_list = [
        "M_METADATA_ID_INDEX",
        "M_SRC_IP_INDEX",
        "M_DST_IP_INDEX",
        "M_SRC_PORT_INDEX",
        "M_DST_PORT_INDEX",
        "M_PROTOCOL_INDEX",
        "M_HEADER_H",
        "M_PAYLOAD_H",
        "M_TCP_FLAG_H",
        "M_FLOW_FIRST_PKT_TIME",
        "M_FLOW_LAST_PKT_TIME",
        "M_OCTET_DELTA_COUNT_FROM_TOTAL_LEN",
    ]

    def metadata_parse_filter(row):
        """Return True if ``row`` should be kept, False otherwise.

        A row is kept only when all of the following hold:
          * its protocol field equals 6 (presumably the IANA number for TCP);
          * its header and payload fields each have length >= 2;
          * ``is_l34_tcp_metadata`` accepts its metadata id;
          * both time fields have the form "<int>-<int>", and in each
            position the first-packet value does not exceed the last-packet
            value.

        Any exception raised while evaluating the row rejects it.
        """
        try:
            if row['M_PROTOCOL_INDEX'] != 6:
                return False
            if len(row['M_HEADER_H']) < 2 or len(row['M_PAYLOAD_H']) < 2:
                return False
            if not is_l34_tcp_metadata(row['M_METADATA_ID_INDEX']):
                return False

            first_time = row['M_FLOW_FIRST_PKT_TIME'].split('-')
            last_time = row['M_FLOW_LAST_PKT_TIME'].split('-')
            flow_first_pkt_time = int(first_time[0])
            # The second component is named "rev" in the original code;
            # presumably the reverse direction of the flow - confirm.
            rev_flow_first_pkt_time = int(first_time[1])
            flow_last_pkt_time = int(last_time[0])
            rev_flow_last_pkt_time = int(last_time[1])

            return (
                flow_first_pkt_time <= flow_last_pkt_time
                and rev_flow_first_pkt_time <= rev_flow_last_pkt_time
            )
        except Exception:
            # Deliberate best-effort: a row that cannot be evaluated (missing
            # value, non-string field, malformed time, helper failure) is
            # dropped instead of aborting the whole file.
            return False

    # pandas 1.3 introduced ``on_bad_lines`` and deprecated the
    # ``error_bad_lines`` / ``warn_bad_lines`` pair; pandas 2.0 removed the
    # old pair, so passing it there raises TypeError. Pick whichever spelling
    # the installed pandas understands; both mean "skip the line and warn".
    if "on_bad_lines" in inspect.signature(pd.read_csv).parameters:
        bad_line_kwargs = {"on_bad_lines": "warn"}
    else:
        bad_line_kwargs = {"error_bad_lines": False, "warn_bad_lines": True}

    for root, _dirs, files in os.walk(dir_name):
        for filename in files:
            file_path = os.path.join(root, filename)
            df = pd.read_csv(
                file_path,
                delimiter='^',
                usecols=columns_index_list,
                names=columns_name_list,
                encoding='utf-8',
                header=0,  # the file's own header row is replaced by `names`
                lineterminator="\n",
                **bad_line_kwargs
            )
            # Row-wise apply builds the boolean mask used to select rows.
            filter_df = df.loc[df.apply(metadata_parse_filter, axis=1)]
            yield filter_df
直接按照row过滤!