如何通过避免 Python 中的循环来优化 DataFrame 上的操作？

import math

import pandas as pd

# Sample address table: one label column and two planar coordinate columns.
keys = ['Address', 'CoordinateX', 'CoordinateY']
values = [['Addr1', 'Addr2', 'Addr3'], [0, 1, 1], [9, 2, 1]]
addresses = dict(zip(keys, values))
df = pd.DataFrame(addresses, columns=keys)
R = 1  # distance threshold the snippets below compare point-to-point distances against

# Greedy radius grouping over the module-level ``df``: each row in turn acts as
# a centre, absorbs every address still left in ``df`` whose distance to it is
# at most ``R`` (itself included, if still present), and is recorded together
# with that tally in a ``Count`` column.
counted_rows = []  # one record per centre that absorbed at least one address

# The rows are snapshotted first because ``df`` is shrunk inside the loop;
# every original row is therefore still visited as a centre.
# NOTE(review): a row already absorbed by an earlier centre is still visited
# and may absorb further rows -- confirm whether such rows should be skipped.
for _, row in list(df.iterrows()):
    x1, y1 = row['CoordinateX'], row['CoordinateY']
    # Euclidean distance from this centre to all remaining addresses at once;
    # this replaces the row-by-row inner loop and needs no ``math`` import.
    distance = ((df['CoordinateX'] - x1) ** 2 + (df['CoordinateY'] - y1) ** 2) ** 0.5
    indices = distance[distance <= R].index  # labels of the addresses inside the radius
    addr_count = len(indices)
    if addr_count > 0:
        row['Count'] = addr_count
        counted_rows.append(row)
        df.drop(indices, inplace=True)  # to exclude the rows in next iteration

# DataFrame.append was removed in pandas 2.0, so the result frame is built in
# one go from the collected rows and renumbered from 0.
df_X = pd.DataFrame(counted_rows).reset_index(drop=True)
df_X.shape

def count_items(x1, y1, r, df):
    """Split off the addresses lying within distance *r* of the point (x1, y1).

    Args:
        x1: X coordinate of the centre point.
        y1: Y coordinate of the centre point.
        r: Radius; rows whose distance to the centre is <= r count as inside.
        df: DataFrame with 'CoordinateX' and 'CoordinateY' columns.

    Returns:
        A tuple ``(df_new, count)`` where ``df_new`` holds only the rows of
        *df* strictly farther than *r* from the centre and ``count`` is the
        number of rows that were left out.
    """
    # Vectorised Euclidean distance; replaces the per-row apply() callback.
    distance = ((df['CoordinateX'] - x1) ** 2 + (df['CoordinateY'] - y1) ** 2) ** 0.5
    df_new = df[distance > r]  # new set with distant addresses only
    return df_new, len(df.index) - len(df_new.index)


def get_counted(r, df):
    """Greedily group the addresses of *df* by radius *r*.

    Every row of *df* is visited in order as a centre. The addresses still
    remaining within distance *r* of it are counted and removed from the
    remaining set; centres that absorbed at least one address are collected.
    The caller's frame is not modified.

    Args:
        r: Radius passed on to ``count_items``.
        df: DataFrame with 'CoordinateX' and 'CoordinateY' columns.

    Returns:
        A DataFrame with one row per recorded centre: the original columns
        plus a 'Count' column, indexed from 0.
    """
    counted_rows = []
    remaining = df  # shrinks as centres absorb nearby addresses
    # NOTE(review): the loop walks the original frame, so a row absorbed by an
    # earlier centre is still visited -- confirm whether it should be skipped.
    for _, row in df.iterrows():
        # The Y coordinate is read from 'CoordinateY'; the earlier code read
        # 'CoordinateX' twice by mistake.
        x1, y1 = row['CoordinateX'], row['CoordinateY']
        remaining, addr_count = count_items(x1, y1, r, remaining)
        if addr_count > 0:
            row['Count'] = addr_count
            counted_rows.append(row)
    # DataFrame.append was removed in pandas 2.0; build the result in one go.
    return pd.DataFrame(counted_rows).reset_index(drop=True)


df_c = df.copy()
df_addrX = get_counted(R, df_c)

1条回答

网友

1楼 · 发布于 2024-09-28 21:55:25

您需要使用 itertools 模块中的 combinations 函数，相关说明可以在 Python 官方文档的 itertools 部分找到。

正如文档所述，您可以这样调用：

combinations('ABCD', 2)

为了获得：

AB AC AD BC BD CD

函数的第一个参数是遍历各行的可迭代对象，第二个参数是 2（因为需要的是两行一组的组合）

例如，您可以对行的索引进行迭代，因此可以获得行的成对索引

之后，得到成对的行以后，就可以对结果进行映射（map）或迭代，从而对每一对行应用计算距离的函数

您可以将所有这些存储在具有以下列的数据框中：

FirstRowIndex
SecondRowIndex
Distance

然后，只需在两列上做一个groupby()：FirstRowIndex和Distance，为了找到同一区域中的元素，删除不需要的元素并保留需要的索引

最后，选择与原始DataFrame中的那些索引对应的行，例如使用df.loc

相关问题更多 >

编程相关推荐

热门问题

热门文章