删除句子并更新索引

# Input: one record holding the text and its annotations. Each annotation
# marks content[start:end], e.g. content[59:65] is "grapes".
data = [{"content":'''Hello I am Aniyya. I enjoy playing Football. I love eating grapes''',"annotations":[{"id":1,"start":11,"end":17,"tag":"name"}, {"id":2,"start":59,"end":65,"tag":"fruit"}]}]

# Expected output: the sentence without annotations is removed and the
# remaining offsets are shifted to match the shorter text. It is bound to its
# own name only; a chained `result_data = data = ...` would overwrite the input.
result_data = [{"content":'''Hello I am Aniyya. I love eating grapes''',"annotations":[{"id":1,"start":11,"end":17,"tag":"name"}, {"id":2,"start":33,"end":39,"tag":"fruit"}]}]

2条回答

网友

1楼 · 编辑于 2024-10-03 00:18:29

import re

data = [{"content":'''Hello I am Aniyya. I enjoy playing Football.
I love eating grapes. Aniyya is great.''',"annotations":[{"id":1,"start":11,"end":17,"tag":"name"},
                                {"id":2,"start":59,"end":65,"tag":"fruit"},
                                {"id":3,"start":67,"end":73,"tag":"name"}]}]


def keep_annotated_sentences(row):
    """Return a copy of *row* containing only the sentences that are annotated.

    The content is split on '.'; a sentence is kept when at least one
    annotation lies entirely inside its stripped text. Kept sentences are
    re-joined as "<sentence>. " (trailing space included) and each kept
    annotation is copied with 'start'/'end' shifted to the new text.

    Annotations are matched to sentences by their character offsets, not by
    searching for the annotated word. This keeps several annotations in one
    sentence, handles repeated words, and avoids building a regular
    expression from the annotated text.

    Args:
        row: dict with 'content' (str) and 'annotations' (list of dicts that
            have integer 'start' and 'end' keys).

    Returns:
        A new dict with 'content' and 'annotations'. Every returned
        annotation carries a 'word' key holding the annotated text.
        Annotations that do not fit inside a single sentence are dropped.
    """
    content = row['content']
    annotations = row['annotations']

    kept_parts = []
    kept_annotations = []
    new_len = 0   # length of the text assembled so far
    offset = 0    # index in `content` where the current piece begins

    for piece in content.split('.'):
        text = piece.strip()
        # Offsets of the stripped sentence inside the original content.
        text_start = offset + (len(piece) - len(piece.lstrip()))
        text_end = text_start + len(text)
        offset += len(piece) + 1  # +1 skips the '.' consumed by split()

        if not text:
            continue

        inside = [a for a in annotations
                  if text_start <= a['start'] and a['end'] <= text_end]
        if not inside:
            continue

        shift = new_len - text_start
        for annotation in inside:
            moved = dict(annotation,
                         start=annotation['start'] + shift,
                         end=annotation['end'] + shift)
            moved.setdefault('word',
                             content[annotation['start']:annotation['end']])
            kept_annotations.append(moved)

        kept_parts.append(text + '. ')
        new_len += len(text) + 2

    return {'content': ''.join(kept_parts), 'annotations': kept_annotations}


# Store the annotated text on every input annotation (this mutates `data`).
for row in data:
    for annotation in row['annotations']:
        annotation['word'] = row['content'][annotation['start']:annotation['end']]

new_data = [keep_annotated_sentences(row) for row in data]

输出：

print(new_data)
[{'content': 'Hello I am Aniyya. I love eating grapes. Aniyya is great. ', 'annotations': [{'id': 1, 'start': 11, 'end': 17, 'tag': 'name', 'word': 'Aniyya'}, {'id': 2, 'start': 33, 'end': 39, 'tag': 'fruit', 'word': 'grapes'}, {'id': 3, 'start': 41, 'end': 47, 'tag': 'name', 'word': 'Aniyya'}]}]

网友

2楼 · 编辑于 2024-10-03 00:18:29

从问题中可以看出，句子之间是用“.”（点）作为分隔符的。因此，你可以先按这个分隔符把内容拆分成一个个句子，然后逐句检查它是否是带有注释的有效句子：是的话就保留，否则就把该句子从字符串中删除（或者说只把保留下来的句子重新拼接起来）。

我已经为这个问题写了一份解决方案草案，它可以完成这项工作。欢迎随时提出任何修改建议。此外，你可能需要对它进行调整，以满足你的确切需求。

# Sample input. The space after "Football." is needed so that the annotation
# offsets line up with the text: content[59:65] must be "grapes".
data = [{"content":'''Hello I am Aniyya. I enjoy playing Football. I love eating grapes''',"annotations":[{"id":1,"start":11,"end":17,"tag":"name"},                {"id":2,"start":59,"end":65,"tag":"fruit"}]}]
identifier = '#'

def processRow(row):
    """Return the content of *row* reduced to its annotated sentences.

    The content is split on "."; a piece is kept when the start offset of at
    least one annotation falls inside it (its terminating period counts as
    part of the piece). Every kept piece is emitted followed by ".", with its
    original leading whitespace preserved.

    Sentences are matched by offset instead of by writing a marker character
    into the text, so content that already contains the marker cannot cause
    an unannotated sentence to be kept, and the text is never altered before
    it is split.

    Args:
        row: dict with "content" (str) and "annotations" (list of dicts with
            an integer "start" key).

    Returns:
        The filtered text. The annotation offsets in *row* are not modified.
    """
    starts = [annotation["start"] for annotation in row["annotations"]]

    kept = []
    offset = 0  # index in the content where the current piece begins
    for piece in row["content"].split("."):
        piece_end = offset + len(piece) + 1  # +1 covers the "." separator
        if any(offset <= start < piece_end for start in starts):
            kept.append(piece + ".")
        offset = piece_end

    return "".join(kept)

def processData(data):
    """Filter every row of *data* in place and print the result.

    Each row's "content" is replaced by the value processRow returns for
    that row. The whole list is then printed; nothing is returned.
    """
    for entry in data:
        entry["content"] = processRow(entry)
    print(data)
    
    
# Run the demo: each row's content in `data` is replaced in place and the
# list is printed.
processData(data)

相关问题更多 >

编程相关推荐

热门问题

热门文章