<p>稍微重构一下代码,使其更具可读性。我用<code>urllib.parse</code>做最后一部分</p>
<pre><code>import re
import urllib.parse
example = pd.Series(['None', 'http://fakeurl.com/example/fakeurl', 'https://www.qwer.com/example/qwer', 'None', 'test.com/example/test', 'None', '123135123', 'nourlhere', 'lol', 'hello.tv', 'nolink', 'ihavenowebsite.com'])
re1 = r'([-a-zA-Z0-9\u0080-\u024F@:%._\+~#=]{1,256})\.[a-zA-Z0-9()]{1,6}\b'
re2 = r'^((http[s]?|ftp):\/)?\/?([-a-zA-Z0-9\u0080-\u024F@:%._\+~#=]{1,256})\.[a-zA-Z0-9()]{1,6}\b$'
re3 = r'www\.([\w]*)'
def modurl(s):
    """Append the site's www-name to a URL whose path is exactly '/example'.

    e.g. 'http://www.hello.tv/example' -> 'http://www.hello.tv/example/hello'.
    Any other string (no netloc, or a different path) is returned unchanged.
    """
    u = urllib.parse.urlparse(s)
    # Only URLs that parsed to a host and end in the bare '/example' path
    # need the site name appended; everything else passes through untouched.
    if not u.netloc or u.path != "/example":
        return s
    # Reuse the already-parsed result instead of calling urlparse(s) again
    # (the original re-parsed s here).  Pattern inlined from re3.
    names = re.findall(r'www\.([\w]*)', u.netloc)
    if not names:
        # Defensive guard: a netloc without a 'www.' prefix used to raise
        # IndexError here; leave such strings unchanged instead.
        return s
    return f"{s}/{names[0]}"
# Normalize every entry, then rebuild a canonical URL for the domain-like ones:
#   1. strip any scheme / 'www.' prefix so each candidate is a bare string,
#   2. prepend 'http://www.' to anything that looks like a domain (re1),
#   3. append '/example' to entries that are *only* a domain (anchored re2),
#   4. let modurl add the site name after a bare '/example' path.
# Plain conditional expressions replace the original np.where calls:
# np.where on a scalar condition returns a 0-d ndarray, not a str, which
# breaks the subsequent re.search/str operations.
example = (
    example
    .map(lambda x: x.replace('https://www.', ''))
    .map(lambda x: x.replace('www.', ''))
    .map(lambda x: x.replace('https://', ''))
    .map(lambda x: x.replace('http://', ''))
    .map(lambda x: "http://www." + x if re.search(re1, x) else x)
    .map(lambda x: x + "/example" if re.search(re2, x) else x)
    .map(modurl)
)
print(example.to_string())
</code></pre>
<p><strong>输出</strong></p>
<pre><code>0 None
1 http://www.fakeurl.com/example/fakeurl
2 http://www.qwer.com/example/qwer
3 None
4 http://www.test.com/example/test
5 None
6 123135123
7 nourlhere
8 lol
9 http://www.hello.tv/example/hello
10 nolink
11 http://www.ihavenowebsite.com/example/ihavenow...
</code></pre>