python2和python3中的python unicode问题

# Stream a gzipped SQL dump and extract the geohack coordinate links from it.
# Every qualifying tuple is written to the SQL output file and/or inserted
# into the SQLite table named "<file_prefix>_externallinks".
#
# NOTE(review): the original snippet was flattened onto two lines, so the
# nesting below is reconstructed from the statement order - confirm it
# against the real script before relying on it.
# NOTE(review): the stored title is taken from the dump as-is; if it is
# percent-encoded (e.g. %D8%A3...), pass it through urllib's unquote() to
# get readable text.
with gzip.open(externallinks_file, 'rb') as single_externallinksfile:
    linecounter = 0        # rows stored since the last commit / progress report
    totlinecounter = 0     # rows stored in total
    filelinecounter = 0    # tuples seen inside INSERT statements

    # Loop invariants, built once instead of once per tuple.
    pagename_pattern = re.compile(r'.*?pagename=(.*?)\'\,\'', flags=re.IGNORECASE)
    # Placeholders instead of string concatenation, so quotes inside a title
    # cannot break (or inject into) the statement.
    # Presumably `cursor` is a sqlite3 cursor ("?" paramstyle) - verify.
    insert_sql = 'insert into ' + file_prefix + '_externallinks values (?,?,?,?,?,?)'

    # We need to read line by line as we have massive files, sometimes multiple GBs
    for line in single_externallinksfile:
        # gzip yields bytes; bytes.decode (Python 3) and str.decode (Python 2)
        # both return unicode text, so no interpreter version check is needed.
        line = line.decode("utf-8")
        if "INSERT INTO" not in line:
            continue

        for statement in line.split("),("):
            filelinecounter += 1
            # "src=" links also exist but have a different layout and are
            # deliberately left out for now.
            if not ("geohack.php?" in statement
                    and "pagename" in statement
                    and "params" in statement):
                continue

            content = pagename_pattern.findall(statement)
            if not content:
                # We even need this check due to corrupted lines
                continue

            splitcontent = content[0].split("&")
            title = splitcontent[0]
            # Reset per tuple so a tuple without a usable "params=" part can
            # neither reuse the previous tuple's values nor raise NameError.
            language = ""
            region = ""
            poitype = ""
            latitude = ""
            longitude = ""
            for subcontent in splitcontent:
                if "language=" in subcontent:
                    language = subcontent.replace("language=", "")
                if "params=" in subcontent:
                    params_string = subcontent.replace("params=", "").split("_")
                    latitude, longitude, poitype, region = get_coordinates_type_region(params_string)

            lat_text = str(latitude)
            lon_text = str(longitude)
            # Both coordinates must be present and must not both be "0".
            # The original "a and b and c or d" parsed as "(a and b and c) or d",
            # which stored tuples with empty coordinates.
            if lat_text == "" or lon_text == "" or (lat_text == "0" and lon_text == "0"):
                continue

            if GENERATE_SQL == "YES":
                # Text output format kept unchanged; values are not escaped here.
                sql_file.write('insert into ' + file_prefix + '_externallinks values ("' + title + '","' + lat_text + '","' + lon_text + '","' + language + '","' + poitype + '","' + region + '");\n')
            if CREATE_SQLITE == "YES":
                cursor.execute(insert_sql, (title, lat_text, lon_text, language, poitype, region))

            linecounter += 1
            if linecounter == 10000:
                if CREATE_SQLITE == "YES":
                    # Do a database commit every 10000 rows
                    wikidb.commit()
                totlinecounter += linecounter
                linecounter = 0
                print('\nProcessed ' + str(totlinecounter) + ' lines out of ' + str(filelinecounter) + ' sql line statements. Elapsed time: ' + str(datetime.datetime.now().replace(microsecond=0) - start_time))

1条回答

网友

1楼 · 发布于 2024-10-02 02:34:23

看起来标题是经过百分号编码（percent-encoded）的，可以用 unquote 把它解码成可读的文本：

# Demo: decode percent-encoded page titles on both Python 2 and Python 3.
try:
    # Python 2 keeps unquote directly in urllib.
    from urllib import unquote
except ImportError:
    # Python 3 moved it to urllib.parse.
    from urllib.parse import unquote

# Sample titles, one per line; the first two are percent-encoded UTF-8.
# The empty first and last entries reproduce the leading and trailing newline.
percent_encoded = "\n".join([
    "",
    "%D9%85%D8%A7%D9%81%D8%B8%D8%A9_%D8%A7%D9%84%D8%A8%D8%AF%D8%A7%D8%A6%D8%B9",
    "%D8%A3%D9%88%D8%B1%D9%8A%D9%88%D9%8A%D9%84%D8%A7",
    "Battle_of_Nicopolis",
    "Qingdao",
    "",
])
print(unquote(percent_encoded))

输出：

مافظة_البدائع
أوريويلا
Battle_of_Nicopolis
Qingdao

相关问题更多 >

编程相关推荐

热门问题

热门文章