6 GB RAM无法使用Word2Vec对文本进行矢量化

# ----- word weights (used later with the word2vec model) and tweet vector assembly -----
def word_weight(data):
    """Build a ``word -> idf weight`` dictionary from tokenised tweets.

    One ``TfidfVectorizer`` is re-fitted for every element of ``data``; the
    items of that element are handed to the vectorizer as separate documents.
    The resulting feature-name/idf pairs are merged into one dictionary, so a
    word seen again in a later element overwrites its earlier weight.

    :param data: sized iterable whose elements are iterables of tokens.
    :return: dict mapping each feature name to the idf value from the most
        recent fit in which it appeared.
    """
    vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
    weights = {}
    for tweet in tqdm(data, total=len(data), desc='Assigning weight to words'):
        # Elements for which the vectorizer raises ValueError are skipped
        # (the original author's note: this catches the empty entries).
        try:
            vectorizer.fit_transform(list(tweet))
            weights.update(zip(vectorizer.get_feature_names(), vectorizer.idf_))
        except ValueError:
            continue
    print("every word has weight now\n"
          "--------------------------------------")
    return weights


# ----- combine word vectors and weights into one vector per tweet -----
def tweet2vec(tokens, size, tfidf):
    """Replace every token list in ``tokens`` with its weighted vector sum.

    For each word, ``model[word] * tfidf[word]`` is added to a zero vector of
    length ``size``; words missing from either mapping are ignored. ``model``
    is read from module scope (presumably the trained word2vec model).

    ``tokens`` is modified in place and also returned.

    :param tokens: list of token lists; overwritten element by element.
    :param size: length of the vectors (the caller passes 200).
    :param tfidf: mapping from word to weight.
    :return: the same ``tokens`` list, now holding one list of floats per tweet.
    """
    progress = tqdm(tokens, total=len(tokens), desc='creating sentence vectors')
    for position, words in enumerate(progress):
        vec = np.zeros(size)
        for word in words:
            try:
                vec += model[word] * tfidf[word]
            except KeyError:
                continue
        # NOTE(review): tolist() stores the vector as a plain Python list of
        # float objects rather than a NumPy row; this is likely what drives
        # the memory usage discussed in this thread -- confirm by profiling.
        tokens[position] = vec.tolist()
    print("tweet vectors are ready for scaling for ML algorithm\n"
          "-------------------------------------------------")
    return tokens


dataset = read_dataset(
    'training.csv',
    ['target', 't_id', 'created_at', 'query', 'user', 'text'],
)
dataset = delete_unwanted_col(dataset, ['t_id', 'created_at', 'query', 'user'])
dataset_token = [
    pre_process(text)
    for text in tqdm(dataset['text'], desc='cleaning text', total=len(dataset['text']))
]
print('pre_process completed, list of tweet tokens is returned\n'
      '--------------------------------------------------------')
# word_weight() runs first; tweet2vec() then overwrites dataset_token in place.
X = np.array(tweet2vec(dataset_token, 200, word_weight(dataset_token)))
print('scaling vectors ...')
X_scaled = scale(X)
print('features scaled!')

2条回答

网友

1楼 · 编辑于 2024-06-28 20:15:22

如果我理解正确的话，它可以处理100万条tweets，但不能处理160万条tweets？所以你知道代码是正确的

如果GPU在您认为不应该的情况下内存不足，那么这些内存可能是被上一个进程占用后没有释放。使用 `nvidia-smi` 检查哪些进程正在使用GPU，以及各自占用了多少内存。如果（在运行代码之前）您发现某个python进程占用了一大块内存，那么它可能是一个已经崩溃的进程，或者是仍然打开着的Jupyter窗口，等等。

我发现 `watch nvidia-smi`（不确定Windows上是否有等价的命令）很有用，可以查看GPU内存在训练过程中的变化。通常会在开始时保留一块内存，之后占用量基本保持恒定。如果您看到它呈线性上升，那么代码可能有问题（例如，您是否在每次迭代时都重新加载了模型？）

网友

2楼 · 编辑于 2024-06-28 20:15:22

当我将代码（tweet2vec函数）改成下面这样时，我的问题就解决了（w为词权重）：

def tweet2vec(tokens, size, tfidf):
    """Return the averaged, weighted word vector for a single tweet.

    Each word contributes ``model[word] * tfidf[word]``; ``model`` is read
    from module scope (presumably the trained word2vec model). Words missing
    from either ``model`` or ``tfidf`` are skipped. The sum is divided by the
    number of words that contributed; if none did, the zero vector is
    returned unchanged.

    :param tokens: iterable of words belonging to one tweet.
    :param size: dimension of the word vectors (the caller passes 200).
    :param tfidf: mapping from word to its weight.
    :return: NumPy array of shape ``(1, size)``.
    """
    total = np.zeros((1, size))
    hits = 0
    for word in tokens:
        try:
            weighted = model[word] * tfidf[word]
        except KeyError:
            # Unknown to the model or to the weight table: ignore the word.
            continue
        total += weighted
        hits += 1
    if hits:
        total /= hits
    return total

# Build one (1, 200) vector per tweet and join them along the first axis.
X = np.concatenate(
    [
        tweet2vec(token, 200, w)
        for token in tqdm(
            dataset_token,
            desc='creating tweet vectors',
            total=len(dataset_token),
        )
    ]
)

我不知道为什么

相关问题更多 >

编程相关推荐

热门问题

热门文章