PySpark 后缀替换：跳过特定单词，且无需转换为 pandas 或 RDD

import pandas as pd

# Words that must never be stemmed, and the suffixes to strip from other words.
exception_list = ['WOODLAND', 'FISHING', 'LAUGHING']
suffix_list = ['ING', 'AND']

# Sample data: one whitespace-separated phrase per row.
cols = ['input']
data = [
    ["CAT DOG FROG WOODLAND FARMLAND LAUGHING UNICORN"],
    ["BOG FISHING CARTING MISSING AND SOGGY"],
    ["SEARCHING"],
    ["FINDING"],
    ["SING SINGING"],
]
df = pd.DataFrame(data, columns=cols)
df.head()


def strip_sufx_word(word, suffix, exception, min_stem_length=4):
    """Strip known suffixes from a single word.

    Suffixes are tried in the order given. A suffix is removed only when the
    remaining stem has at least ``min_stem_length`` characters and the word
    (as it stands at that point) is not listed in ``exception``.

    Args:
        word: The word to process.
        suffix: Iterable of suffix strings to try, in order.
        exception: Collection of words that must be left untouched.
        min_stem_length: Minimum length the stem must keep after stripping.

    Returns:
        The word with every applicable suffix removed.
    """
    for sufx in suffix:
        # An empty suffix can never shorten the word, so skip it explicitly.
        if not sufx or not word.endswith(sufx):
            continue
        stem = word[:-len(sufx)]
        if len(stem) >= min_stem_length and word not in exception:
            # Later suffixes are checked against the already-shortened word.
            word = stem
    return word


def strip_sufx_string(phrase, suffix, exception):
    """Apply ``strip_sufx_word`` to each whitespace-separated word of a phrase.

    Args:
        phrase: Input string; it is split on any whitespace.
        suffix: Iterable of suffix strings to try on each word.
        exception: Collection of words that must be left untouched.

    Returns:
        The processed words joined back together with single spaces.
    """
    return ' '.join(
        strip_sufx_word(word, suffix, exception) for word in phrase.split()
    )


# Extra keyword arguments to Series.apply are forwarded to strip_sufx_string.
df['output'] = df['input'].apply(
    strip_sufx_string, suffix=suffix_list, exception=exception_list
)
df.head()

1条回答

网友

1楼 · 发布于 2024-09-27 19:32:30

高阶函数在这里很有用：

import pyspark.sql.functions as F

# Configuration: words to leave untouched, suffixes to strip, and the minimum
# number of characters that must remain after a suffix is removed.
exception_list = ['WOODLAND', 'FISHING', 'LAUGHING']
suffix_list = ['ING', 'AND']
min_stem_length = 4

# Literal array columns so the SQL expression below can refer to the Python
# lists by column name.
exceptions_col = F.array(*map(F.lit, exception_list))
suffixes_col = F.array(*map(F.lit, suffix_list))

# Spark SQL expression, evaluated per row:
#   * split `input` on single spaces and transform each word;
#   * for each word, fold over `suffix_list` with the word as the initial
#     accumulator, dropping suffix `s` when the accumulator ends with `s`,
#     the remaining stem has at least `min_stem_length` characters, and the
#     accumulator is not in `exception_list`;
#   * join the resulting words back together with single spaces.
strip_suffix_sql = f"""
        concat_ws(' ', 
            transform(
                split(input, ' '), 
                word -> 
                    aggregate(
                        suffix_list, 
                        word, 
                        (acc, s) -> 
                            case when substring(acc, -length(s)) = s 
                                 and length(substring(acc, 1, length(acc)-length(s))) >= {min_stem_length} 
                                 and not array_contains(exception_list, acc) 
                                 then substring(acc, 1, length(acc)-length(s)) 
                                 else acc 
                            end
                     )
            )
        )
    """

# NOTE(review): `sdf` is not defined in this snippet; it is assumed to be an
# existing Spark DataFrame with a string column named `input`.
with_lookups = (
    sdf
    .withColumn('exception_list', exceptions_col)
    .withColumn('suffix_list', suffixes_col)
)

# Compute the stemmed text, then drop the two helper columns again.
result = (
    with_lookups
    .withColumn('output', F.expr(strip_suffix_sql))
    .drop('exception_list', 'suffix_list')
)

result.show(truncate=False)
+-----------------------------------------------+--------------------------------------------+
|input                                          |output                                      |
+-----------------------------------------------+--------------------------------------------+
|CAT DOG FROG WOODLAND FARMLAND LAUGHING UNICORN|CAT DOG FROG WOODLAND FARML LAUGHING UNICORN|
|BOG FISHING CARTING MISSING AND SOGGY          |BOG FISHING CART MISS AND SOGGY             |
|SEARCHING                                      |SEARCH                                      |
|FINDING                                        |FIND                                        |
|SING SINGING                                   |SING SING                                   |
+-----------------------------------------------+--------------------------------------------+

相关问题更多 >

编程相关推荐

热门问题

热门文章