<p>试试下面这段代码,然后告诉我效果如何。它应该更省时间,原因有两点:访问列表、dict 这类内置结构比访问 pandas 数据帧更快;并且在调用 re 库之前,先用简单的子串检查对候选项做一次快速初筛。</p>
<pre><code># necessary imports
import pandas as pd
import itertools
import re
# Sample dataframes used to exercise the search below.
df1 = pd.DataFrame({
    'id': ['00001', '00261', '00002'],
    'name': ['angiocarcoma', 'shrimp allergy', 'fish allergy'],
})
df = pd.DataFrame({
    'Entry_name': ['TRGV2', 'TRGJ1', 'TRGJ2'],
    'CA': ['3BHS1 HSD3B1 3BH HSDB3', '3BP1 SH3BP1 IF', '3BP0'],
})

# Pull the columns out into plain Python lists once, so the search does not
# have to go through pandas on every call.
# dict.fromkeys() removes duplicates while keeping first-seen order; a set
# would also deduplicate, but its iteration order for strings changes from
# one interpreter run to the next, making the results non-reproducible.
disease_list = list(dict.fromkeys(df1['name']))
CA_list = list(dict.fromkeys(df['CA']))

# Each 'CA' cell holds several whitespace-separated tokens. Flatten them and
# deduplicate again: the same token can occur in more than one row.
valid_CA_list_tmp = list(
    dict.fromkeys(itertools.chain.from_iterable(x.split() for x in CA_list))
)

# Keep only tokens longer than two characters (this drops e.g. 'IF').
valid_CA_list = [x for x in valid_CA_list_tmp if len(x) > 2]
# the function
def disease_search_v2(nltk_tokens_sen):
    """Map every disease mentioned in a sentence to the CA tokens found in it.

    Candidates are read from the module-level ``disease_list`` and
    ``valid_CA_list``. A candidate counts as mentioned when it occurs in the
    sentence as a whole term, i.e. not directly preceded or followed by a
    word character. Matching is case-sensitive.

    Args:
        nltk_tokens_sen: The sentence to scan, as a single string.

    Returns:
        A dict with one key per disease found; each value is a list of all
        CA tokens found in the sentence. An empty dict is returned unless at
        least one disease and at least one CA token are both found.
    """

    def _mentions(term):
        # A plain substring test is much cheaper than a regex search, so use
        # it to discard most candidates first. Empty terms never match.
        if not term or term not in nltk_tokens_sen:
            return False
        # re.escape() makes the term match literally even when it contains
        # regex metacharacters such as '+' or '('. Lookarounds replace \b so
        # that terms beginning or ending with punctuation can still match.
        pattern = rf"(?<!\w){re.escape(term)}(?!\w)"
        return re.search(pattern, nltk_tokens_sen) is not None

    found_diseases = [name for name in disease_list if _mentions(name)]
    if not found_diseases:
        return {}

    found_CA = [token for token in valid_CA_list if _mentions(token)]
    if not found_CA:
        return {}

    # Give each disease its own list so that mutating one entry of the
    # result cannot silently change the others.
    return {name: list(found_CA) for name in found_diseases}
# Test cases. The return values are not stored or printed, so they are only
# visible when the calls are run in an interactive session.
# The expected results noted below assume the sample df1/df defined above.
# One disease and one CA token -> expected {'angiocarcoma': ['3BHS1']}
disease_search_v2('very hard angiocarcoma diagnosed 3BHS1')
# 'IF' has only two characters and is filtered out of valid_CA_list -> expected {}
disease_search_v2('very hard angiocarcoma diagnosed IF')
# Expected {'angiocarcoma': ['3BP0']}
disease_search_v2('very hard angiocarcoma diagnosed 3BP0')
# Two CA tokens -> 'angiocarcoma' maps to both, listed in valid_CA_list order
disease_search_v2('very hard angiocarcoma diagnosed 3BP0 3BHS1')
# Two diseases -> 'fish allergy' and 'angiocarcoma' each map to both CA tokens
disease_search_v2('fish allergy very hard angiocarcoma diagnosed 3BP0 3BHS1')
# Same sentence repeated on two lines -> expected to match the previous result,
# since each candidate is reported at most once
disease_search_v2('fish allergy very hard angiocarcoma diagnosed 3BP0 3BHS1\nfish allergy very hard angiocarcoma diagnosed 3BP0 3BHS1\n')
</code></pre>