使用正则表达式或python函数提取两个字符串的所有相同对之间的所有字符串

2024-05-19 11:04:28 发布

您现在位置:Python中文网/ 问答频道 /正文

我正在尝试使用正则表达式或python函数来提取所有粗体文本,或者说介于单引号 ' 和 <= 之间的文本

"[Text(447.1153846153846, 471.625, 'the <= 0.5\nentropy = 0.97\nsamples = 100.0%\nvalue = [0.399, 0.601]\nclass = True News'), Text(238.46153846153845, 336.875, 'donald <= 0.5\nentropy = 0.921\nsamples = 83.7%\nvalue = [0.336, 0.664]\nclass = True News'), Text(119.23076923076923, 202.125, 'hillary <= 0.5\nentropy = 0.981\nsamples = 55.6%\nvalue = [0.42, 0.58]\nclass = True News'), Text(59.61538461538461, 67.375, '\n (...) \n'), Text(178.84615384615384, 67.375, '\n (...) \n'), Text(357.6923076923077, 202.125, 'hillary <= 0.5\nentropy = 0.663\nsamples = 28.2%\nvalue = [0.172, 0.828]\nclass = True News'), Text(298.0769230769231, 67.375, '\n (...) \n'), Text(417.30769230769226, 67.375, '\n (...) \n'), Text(655.7692307692307, 336.875, 'trumps <= 0.5\nentropy = 0.859\nsamples = 16.3%\nvalue = [0.718, 0.282]\nclass = Fake News'), Text(596.1538461538462, 202.125, 'hillary <= 0.5\nentropy = 0.821\nsamples = 15.7%\nvalue = [0.744, 0.256]\nclass = Fake News'), Text(536.5384615384615, 67.375, '\n (...) \n'), Text(655.7692307692307, 67.375, '\n (...) \n'), Text(715.3846153846154, 202.125, 'entropy = 0.0\nsamples = 0.6%\nvalue = [0.0, 1.0]\nclass = True News')]"

到目前为止,我得到的最接近的写法是 (?=')(*)(?=<=),但仍然没有成功

有人能告诉我如何提取位于单引号 ' 和 <= 之间的这些粗体文本吗

不需要使用正则表达式

谢谢


Tags: the函数text文本lttruefakenews
3条回答

这个正则表达式有效。我们使用命名组,因此很容易引用所需的确切数据。它被设置为查找后面紧跟“ <=”的连续单词字符和数字。然后我们使用finditer获得所有匹配项

import re

# Stringified list of Text(...) entries that the pattern below is run against.
data = "[Text(447.1153846153846, 471.625, 'the <= 0.5\nentropy = 0.97\nsamples = 100.0%\nvalue = [0.399, 0.601]\nclass = True News'), Text(238.46153846153845, 336.875, 'donald <= 0.5\nentropy = 0.921\nsamples = 83.7%\nvalue = [0.336, 0.664]\nclass = True News'), Text(119.23076923076923, 202.125, 'hillary <= 0.5\nentropy = 0.981\nsamples = 55.6%\nvalue = [0.42, 0.58]\nclass = True News'), Text(59.61538461538461, 67.375, '\n (...) \n'), Text(178.84615384615384, 67.375, '\n (...) \n'), Text(357.6923076923077, 202.125, 'hillary <= 0.5\nentropy = 0.663\nsamples = 28.2%\nvalue = [0.172, 0.828]\nclass = True News'), Text(298.0769230769231, 67.375, '\n (...) \n'), Text(417.30769230769226, 67.375, '\n (...) \n'), Text(655.7692307692307, 336.875, 'trumps <= 0.5\nentropy = 0.859\nsamples = 16.3%\nvalue = [0.718, 0.282]\nclass = Fake News'), Text(596.1538461538462, 202.125, 'hillary <= 0.5\nentropy = 0.821\nsamples = 15.7%\nvalue = [0.744, 0.256]\nclass = Fake News'), Text(536.5384615384615, 67.375, '\n (...) \n'), Text(655.7692307692307, 67.375, '\n (...) \n'), Text(715.3846153846154, 202.125, 'entropy = 0.0\nsamples = 0.6%\nvalue = [0.0, 1.0]\nclass = True News')]"

# A run of word characters immediately followed by " <="; the named group
# "info" captures the word itself (the " <=" suffix is matched but not kept).
fmt = re.compile(r'(?P<info>[\w\d]+) <=', re.I)

# The pattern has exactly one group, so findall yields that group's text for
# every match, in the order the matches appear in the input.
for word in fmt.findall(data):
    print(word)

如果您想要完整的解析结果,下面的代码会把整个内容解析成命名元组,其结构基本对应文本的格式。我不知道前两个值代表什么,所以就把它们叫做x和y。我这么做是因为你想要的结果本身似乎用处不大,我认为这个问题只是进一步提取更多数据的前奏。这样可以精确定位所有数据。任何数据为\n (...) \n的条目都会被打印为“空”,并且不会存入entries列表

import re
from collections import namedtuple

data    = "[Text(447.1153846153846, 471.625, 'the <= 0.5\nentropy = 0.97\nsamples = 100.0%\nvalue = [0.399, 0.601]\nclass = True News'), Text(238.46153846153845, 336.875, 'donald <= 0.5\nentropy = 0.921\nsamples = 83.7%\nvalue = [0.336, 0.664]\nclass = True News'), Text(119.23076923076923, 202.125, 'hillary <= 0.5\nentropy = 0.981\nsamples = 55.6%\nvalue = [0.42, 0.58]\nclass = True News'), Text(59.61538461538461, 67.375, '\n (...) \n'), Text(178.84615384615384, 67.375, '\n (...) \n'), Text(357.6923076923077, 202.125, 'hillary <= 0.5\nentropy = 0.663\nsamples = 28.2%\nvalue = [0.172, 0.828]\nclass = True News'), Text(298.0769230769231, 67.375, '\n (...) \n'), Text(417.30769230769226, 67.375, '\n (...) \n'), Text(655.7692307692307, 336.875, 'trumps <= 0.5\nentropy = 0.859\nsamples = 16.3%\nvalue = [0.718, 0.282]\nclass = Fake News'), Text(596.1538461538462, 202.125, 'hillary <= 0.5\nentropy = 0.821\nsamples = 15.7%\nvalue = [0.744, 0.256]\nclass = Fake News'), Text(536.5384615384615, 67.375, '\n (...) \n'), Text(655.7692307692307, 67.375, '\n (...) \n'), Text(715.3846153846154, 202.125, 'entropy = 0.0\nsamples = 0.6%\nvalue = [0.0, 1.0]\nclass = True News')]"

#regex to describe the overall entry: Text(<x>, <y>, '<data>')
#  re.S lets a negated class / dot span the real newlines inside <data>
entfmt  = re.compile(r"Text\((?P<x>[\d.]+), (?P<y>[\d.]+), '(?P<data>[^']+)'\)", re.I|re.S)

#regex to describe the data portion of an entry
#  every fragment is a raw string: written as plain strings, \d \w \[ \] are
#  invalid string escapes (SyntaxWarning on Python 3.12+); inside a raw string
#  "\n" is still understood by the re module as a newline
#  the leading "<focus> <= <fval>" line is optional because an entry may start
#  directly with "entropy = ..." (the last entry of the sample data does)
datfmt  = re.compile(
    r'(?:(?P<focus>\w+) <= (?P<fval>[\d.]+)\n)?'
    r'entropy = (?P<ent>[\d.]+)'
    r'\nsamples = (?P<samp>[\d.]+)%'
    r'\nvalue = \[(?P<lval>[\d.]+), (?P<rval>[\d.]+)\]'
    r'\nclass = (?P<class>.+)',
    re.I|re.S)

#entry descriptor
Entry   = namedtuple('Entry', 'x y focus fvalue entropy samples value cls')


def parse_entries(text):
    """Parse every ``Text(x, y, '...')`` item found in *text*.

    Args:
        text: string containing zero or more ``Text(...)`` items.

    Returns:
        A list of ``Entry`` tuples, one per item whose data portion matches
        ``datfmt``, in order of appearance. ``focus`` and ``fvalue`` are
        ``None`` for items that have no leading ``<word> <= <number>`` line.

    Side effects:
        Prints one ``Data[i] ... is empty`` line for each item whose data
        portion does not match ``datfmt`` (e.g. ``'\\n (...) \\n'``); ``i`` is
        the item's position among all ``Text(...)`` items in *text*.
    """
    parsed = []
    for index, item in enumerate(entfmt.finditer(text)):
        #consistent entry data
        x, y = float(item.group('x')), float(item.group('y'))
        #get all data for this entry
        fields = datfmt.match(item.group('data'))
        if not fields:
            #entry has empty data
            print('Data[{}] with [x:{}, y:{}] is empty'.format(index, x, y))
            continue
        #focus/fval are None when the optional first line is absent
        focus, fval = fields.group('focus', 'fval')
        parsed.append(Entry(x, y,
                            focus,
                            None if fval is None else float(fval),
                            float(fields.group('ent')),
                            float(fields.group('samp')),
                            [float(fields.group('lval')), float(fields.group('rval'))],
                            fields.group('class')))
    return parsed


#container for individual entries
entries = parse_entries(data)

#print all entries
print(*entries, sep='\n')

#(the "Data[i] ... is empty" lines for the '\n (...) \n' items are printed first)
#Entry(x=447.1153846153846 , y=471.625, focus='the'    , fvalue=0.5 , entropy=0.97 , samples=100.0, value=[0.399, 0.601], cls='True News')
#Entry(x=238.46153846153845, y=336.875, focus='donald' , fvalue=0.5 , entropy=0.921, samples=83.7 , value=[0.336, 0.664], cls='True News')
#Entry(x=119.23076923076923, y=202.125, focus='hillary', fvalue=0.5 , entropy=0.981, samples=55.6 , value=[0.42 , 0.58 ], cls='True News')
#Entry(x=357.6923076923077 , y=202.125, focus='hillary', fvalue=0.5 , entropy=0.663, samples=28.2 , value=[0.172, 0.828], cls='True News')
#Entry(x=655.7692307692307 , y=336.875, focus='trumps' , fvalue=0.5 , entropy=0.859, samples=16.3 , value=[0.718, 0.282], cls='Fake News')
#Entry(x=596.1538461538462 , y=202.125, focus='hillary', fvalue=0.5 , entropy=0.821, samples=15.7 , value=[0.744, 0.256], cls='Fake News')
#Entry(x=715.3846153846154 , y=202.125, focus=None     , fvalue=None, entropy=0.0  , samples=0.6  , value=[0.0  , 1.0  ], cls='True News')

对单引号 ' 使用后向断言(lookbehind),对 <= 使用前向断言(lookahead),
这样两者之间的非引号字符就是要匹配的内容

r"(?<=')[^']*?(?=\s*<=)"  

https://regex101.com/r/KlYLQ2/1

一种方法:

import re

text = "[Text(447.1153846153846, 471.625, 'the <= 0.5\nentropy = 0.97\nsamples = 100.0%\nvalue = [0.399, " \
       "0.601]\nclass = True News'), Text(238.46153846153845, 336.875, 'donald <= 0.5\nentropy = 0.921\nsamples = " \
       "83.7%\nvalue = [0.336, 0.664]\nclass = True News'), Text(119.23076923076923, 202.125, 'hillary <= " \
       "0.5\nentropy = 0.981\nsamples = 55.6%\nvalue = [0.42, 0.58]\nclass = True News'), Text(59.61538461538461, " \
       "67.375, '\n (...) \n'), Text(178.84615384615384, 67.375, '\n (...) \n'), Text(357.6923076923077, 202.125, " \
       "'hillary <= 0.5\nentropy = 0.663\nsamples = 28.2%\nvalue = [0.172, 0.828]\nclass = True News'), " \
       "Text(298.0769230769231, 67.375, '\n (...) \n'), Text(417.30769230769226, 67.375, '\n (...) \n'), " \
       "Text(655.7692307692307, 336.875, 'trumps <= 0.5\nentropy = 0.859\nsamples = 16.3%\nvalue = [0.718, " \
       "0.282]\nclass = Fake News'), Text(596.1538461538462, 202.125, 'hillary <= 0.5\nentropy = 0.821\nsamples = " \
       "15.7%\nvalue = [0.744, 0.256]\nclass = Fake News'), Text(536.5384615384615, 67.375, '\n (...) \n'), " \
       "Text(655.7692307692307, 67.375, '\n (...) \n'), Text(715.3846153846154, 202.125, 'entropy = 0.0\nsamples = " \
       "0.6%\nvalue = [0.0, 1.0]\nclass = True News')] "

# Capture what lies between an opening ", '" and the FIRST "<=" on that line.
# The previous pattern used "(.*)?", i.e. a greedy ".*" inside an optional
# group rather than the lazy ".*?"; it ran to the LAST "<=" on the line and
# could cross a closing quote. The lazy, quote/newline-excluding class keeps
# the capture inside a single quoted label.
label_re = re.compile(r", '([^'\n]*?)<=")


def split_features(source):
    """Return the text found between ``, '`` and the following ``<=``.

    Args:
        source: string to scan.

    Returns:
        A list with one captured string per match, in order of appearance.
        Whitespace preceding ``<=`` is kept, so ``'the <= 0.5'`` yields
        ``'the '``. An input without matches yields an empty list.
    """
    return [found.group(1) for found in label_re.finditer(source)]


for feature in split_features(text):
    print(feature)

输出

the 
donald 
hillary 
hillary 
trumps 
hillary 

相关问题 更多 >

    热门问题