将文本文件拆分为多个部分，然后搜索这些部分中的关键短语

# Question's original script with its line structure restored (the page had
# collapsed it onto a single line). The logic is intentionally left as the
# asker wrote it: counts accumulate over the whole file instead of being
# reported per section, which is the problem being asked about.
from bs4 import BeautifulSoup
import re
import time
import random
import glob, os
import string

# Phrases to search for, each mapped to a running counter.
termz = {
    'does not exceed': 0,
    'shall not exceed': 0,
    'not exceeding': 0,
    'do not exceed': 0,
    'not to exceed': 0,
    'shall at no time exceed': 0,
    'shall not be less than': 0,
    'not less than': 0,
}

with open('Q:/hello/place/textfile.txt', 'r') as f:
    # Sections are delimited by a line of asterisks.
    sections = f.read().split('**************************************************')
    for p in sections[1:]:
        for eachKey in termz.keys():
            if eachKey in p:
                termz[eachKey] = termz.get(eachKey) + 1
            # NOTE(review): the original indentation of this print was lost;
            # it is placed inside the inner loop because the reply below says
            # to move it out of the inner loop -- confirm.
            print(termz)

#print(len(sections)) #there are thirty sections
#should be if code encounters ***** then it resets the counters and just moves on....
#so far only can count the phrases over the entire text file....
#GO BACK TO .SPLIT()
# termz = dict.fromkeys(termz,0) #resets the counter

2条回答

网友

1楼 · 编辑于 2024-09-29 23:28:27

你的代码已经很接近了。请参见下面代码中的注释：

# Phrases to look for. Each maps to a counter that is bumped (at most once
# per phrase) for the section currently being scanned.
termz = dict.fromkeys(
    (
        'does not exceed',
        'shall not exceed',
        'not exceeding',
        'do not exceed',
        'not to exceed',
        'shall at no time exceed',
        'shall not be less than',
        'not less than',
    ),
    0,
)

with open('Q:/hello/place/textfile.txt', 'r') as handle:
    sections = handle.read().split('**************************************************')

    # Everything before the first separator is skipped (assumed intentional).
    for section_text in sections[1:]:
        # Plain += 1 replaces the question's termz.get(eachKey) + 1.
        found = [phrase for phrase in termz if phrase in section_text]
        for phrase in found:
            termz[phrase] += 1

        # Report once per section, i.e. outside the per-phrase loop.
        print(termz)

        # Start the next section from a clean slate of zeroed counters.
        termz = dict.fromkeys(termz, 0)

编辑

输入和输出示例：

# Self-contained demo: the text is held in a string instead of being read
# from disk. It is named sample_text so the builtin input() is not shadowed.
sample_text = '''
Section 1:

This section is ignored.
does not exceed
**************************************************
Section 2:

shall not exceed
not to exceed
**************************************************
Section 3:

not less than'''

# Phrases to search for, each mapped to its counter for the current section.
termz = {
    'does not exceed': 0,
    'shall not exceed': 0,
    'not exceeding': 0,
    'do not exceed': 0,
    'not to exceed': 0,
    'shall at no time exceed': 0,
    'shall not be less than': 0,
    'not less than': 0
}

sections = sample_text.split('**************************************************')

# Skip the first section. (I assume this is on purpose?)
for p in sections[1:]:
    for eachKey in termz:
        if eachKey in p:
            # This is simpler than termz[eachKey] = termz.get(eachKey) + 1
            termz[eachKey] += 1

    # Print once per section, outside of the inner loop
    print(termz)

    # After printing the results for that section, reset the counts
    termz = dict.fromkeys(termz, 0)

# OUTPUT (Python 3.7+, where dicts keep insertion order):
# {'does not exceed': 0, 'shall not exceed': 1, 'not exceeding': 0, 'do not exceed': 0, 'not to exceed': 1, 'shall at no time exceed': 0, 'shall not be less than': 0, 'not less than': 0}
# {'does not exceed': 0, 'shall not exceed': 0, 'not exceeding': 0, 'do not exceed': 0, 'not to exceed': 0, 'shall at no time exceed': 0, 'shall not be less than': 0, 'not less than': 1}

网友
2楼 · 编辑于 2024-09-29 23:28:27

# Fragment meant to replace the body of the question's inner loop.
# NOTE(review): line breaks were lost on the page; print(termz) is assumed
# to sit after the `if`, at the same level -- confirm.
if eachKey in p:
    termz[eachKey] += 1  # might do it
print(termz)

相关问题更多 >

编程相关推荐

热门问题

热门文章