Spacy v3 data format issue when training a textcat_multilabel model


I'm trying to convert data from a CSV into a DocBin in order to train a model with a textcat_multilabel component, as follows:

import newspaper
import pandas as pd
from spacy.tokens import DocBin

def convert_cat_annontation_from_sentinelle_db(output_path, nlp, input_path, cats_empty):
    # Takes in a csv file resulting from the SQL query qry_export_rdp_annotations.sql
    db = DocBin()  # create a DocBin object

    annotated_data = pd.read_csv(input_path)
    print(len(annotated_data))

    for idx, row in annotated_data.iterrows():  # data in previous format
        cats = dict(cats_empty)  # copy of the label dict, every category set to 0

        try:
            # fetch article full text from link
            article = newspaper.Article(row['link'])
            article.download()
            article.parse()

            doc = nlp.make_doc(article.text)  # create doc object from text

            # Assign 1 to positive match for each category
            cats_list = row["GROUP_CONCAT(j.name)"].split(',')
            for cat in cats_list:
                cats[cat] = 1

            doc.cats = cats
            print(doc)
            print(cats)
            print(doc.cats)
            db.add(doc)
        except Exception:
            # skip articles that could not be downloaded or parsed
            continue

    db.to_disk(output_path)  # save the DocBin object
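
For reference, a minimal sketch of how this function might be called; the blank French pipeline and the file paths are placeholders, and the label set is taken from the doc.cats example below:

import spacy

# Hypothetical invocation; paths and pipeline are placeholders, not from the original post.
nlp = spacy.blank("fr")  # only the tokenizer is needed for nlp.make_doc
cats_empty = {
    "Santé": 0, "Économie": 0, "Infrastructure": 0,
    "Politique fédérale": 0, "Politique provinciale": 0,
    "Politique municipale": 0, "Éducation": 0,
    "Faits divers": 0, "Culture": 0,
}
convert_cat_annontation_from_sentinelle_db("./train.spacy", nlp, "./annotations.csv", cats_empty)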

Example of the printed output of doc.cats:

{'Santé': 1, 'Économie': 0, 'Infrastructure': 0, 'Politique fédérale': 0, 'Politique provinciale': 1, 'Politique municipale': 0, 'Éducation': 0, 'Faits divers': 0, 'Culture': 0}

Full error message when running the spacy train CLI command:

ℹ Using CPU

=========================== Initializing pipeline ===========================
[2021-08-18 06:09:46,242] [INFO] Set up nlp object from config
[2021-08-18 06:09:46,259] [INFO] Pipeline: ['tok2vec', 'textcat_multilabel', 'ner', 'parser']
[2021-08-18 06:09:46,266] [INFO] Created vocabulary
[2021-08-18 06:09:50,649] [INFO] Added vectors: fr_core_news_lg
[2021-08-18 06:09:56,557] [INFO] Finished initializing nlp object
[2021-08-18 06:10:07,714] [INFO] Initialized pipeline components: ['tok2vec', 'textcat_multilabel', 'ner', 'parser']
✔ Initialized pipeline

============================= Training pipeline =============================
ℹ Pipeline: ['tok2vec', 'textcat_multilabel', 'ner', 'parser']
ℹ Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS TEXTC...  LOSS NER  LOSS PARSER  CATS_SCORE  ENTS_F  ENTS_P  ENTS_R  DEP_UAS  DEP_LAS  SENTS_F  SCORE 
---  ------  ------------  -------------  --------  -----------  ----------  ------  ------  ------  -------  -------  -------  ------
Traceback (most recent call last):
  File "/Users/alex/PycharmProjects/Ecosysteme_NLP_Sentinelle/python39clean/lib/python3.9/site-packages/spacy/training/loop.py", line 281, in evaluate
    scores = nlp.evaluate(dev_corpus(nlp))
  File "/Users/alex/PycharmProjects/Ecosysteme_NLP_Sentinelle/python39clean/lib/python3.9/site-packages/spacy/language.py", line 1389, in evaluate
    results = scorer.score(examples)
  File "/Users/alex/PycharmProjects/Ecosysteme_NLP_Sentinelle/python39clean/lib/python3.9/site-packages/spacy/scorer.py", line 135, in score
    scores.update(component.score(examples, **self.cfg))
  File "/Users/alex/PycharmProjects/Ecosysteme_NLP_Sentinelle/python39clean/lib/python3.9/site-packages/spacy/pipeline/textcat_multilabel.py", line 179, in score
    return Scorer.score_cats(
  File "/Users/alex/PycharmProjects/Ecosysteme_NLP_Sentinelle/python39clean/lib/python3.9/site-packages/spacy/scorer.py", line 465, in score_cats
    auc_per_type[label].score_set(pred_score, gold_score)
KeyError: 'Politique fédérale'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/Users/alex/PycharmProjects/Ecosysteme_NLP_Sentinelle/python39clean/lib/python3.9/site-packages/spacy/__main__.py", line 4, in <module>
    setup_cli()
  File "/Users/alex/PycharmProjects/Ecosysteme_NLP_Sentinelle/python39clean/lib/python3.9/site-packages/spacy/cli/_util.py", line 69, in setup_cli
    command(prog_name=COMMAND)
  File "/Users/alex/PycharmProjects/Ecosysteme_NLP_Sentinelle/python39clean/lib/python3.9/site-packages/click/core.py", line 829, in __call__
    return self.main(*args, **kwargs)
  File "/Users/alex/PycharmProjects/Ecosysteme_NLP_Sentinelle/python39clean/lib/python3.9/site-packages/click/core.py", line 782, in main
    rv = self.invoke(ctx)
  File "/Users/alex/PycharmProjects/Ecosysteme_NLP_Sentinelle/python39clean/lib/python3.9/site-packages/click/core.py", line 1259, in invoke
    return _process_result(sub_ctx.command.invoke(sub_ctx))
  File "/Users/alex/PycharmProjects/Ecosysteme_NLP_Sentinelle/python39clean/lib/python3.9/site-packages/click/core.py", line 1066, in invoke
    return ctx.invoke(self.callback, **ctx.params)
  File "/Users/alex/PycharmProjects/Ecosysteme_NLP_Sentinelle/python39clean/lib/python3.9/site-packages/click/core.py", line 610, in invoke
    return callback(*args, **kwargs)
  File "/Users/alex/PycharmProjects/Ecosysteme_NLP_Sentinelle/python39clean/lib/python3.9/site-packages/typer/main.py", line 497, in wrapper
    return callback(**use_params)  # type: ignore
  File "/Users/alex/PycharmProjects/Ecosysteme_NLP_Sentinelle/python39clean/lib/python3.9/site-packages/spacy/cli/train.py", line 59, in train_cli
    train(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr)
  File "/Users/alex/PycharmProjects/Ecosysteme_NLP_Sentinelle/python39clean/lib/python3.9/site-packages/spacy/training/loop.py", line 122, in train
    raise e
  File "/Users/alex/PycharmProjects/Ecosysteme_NLP_Sentinelle/python39clean/lib/python3.9/site-packages/spacy/training/loop.py", line 105, in train
    for batch, info, is_best_checkpoint in training_step_iterator:
  File "/Users/alex/PycharmProjects/Ecosysteme_NLP_Sentinelle/python39clean/lib/python3.9/site-packages/spacy/training/loop.py", line 226, in train_while_improving
    score, other_scores = evaluate()
  File "/Users/alex/PycharmProjects/Ecosysteme_NLP_Sentinelle/python39clean/lib/python3.9/site-packages/spacy/training/loop.py", line 283, in evaluate
    raise KeyError(Errors.E900.format(pipeline=nlp.pipe_names)) from e
KeyError: "[E900] Could not run the full pipeline for evaluation. If you specified frozen components, make sure they were already initialized and trained. Full pipeline: ['tok2vec', 'textcat_multilabel', 'ner', 'parser']"

Output when running the spacy debug data CLI command:

====================== Text Classification (Multilabel) ======================
ℹ Text Classification: 30 label(s)
⚠ Some model labels are not present in the train data. The model
performance may be degraded for these labels after training: 'v', 'F', 'm', 'f',
'É', 'l', 'c', 'q', 'o', ']', 'u', 'I', 'P', 'r', 'a', 'D', 'é', 'S', 't', ',',
'M', ' ', 's', ''', 'd', 'i', 'p', 'e', 'n', '['.

Looking at the list of labels in that output, I'm fairly sure my problem lies in how I'm formatting the dictionary used to set doc.cats, but I can't seem to find the correct way to format it. I'm sure it's somewhere in the docs, but I can't find it and feel a bit silly.


1 Answer

The sample output from doc.cats looks fine, but as you noticed, the output of debug data indicates that something is wrong. It looks like a string is being used somewhere where you think you have a list. You may only be doing this for one or two documents, but it causes the more general problem. Also note that the bad categories seem to be present only in your dev data.
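
A minimal sketch of what that kind of mix-up can look like; the stringified-list cell below is hypothetical, not taken from the actual CSV:

cats_empty = {"Santé": 0, "Politique fédérale": 0}

cell = "['Santé', 'Politique fédérale']"  # hypothetical: a cell holding the *string* form of a list
cats = dict(cats_empty)
for cat in cell:  # iterating over a string goes character by character
    cats[cat] = 1

print([k for k in cats if k not in cats_empty])
# prints single-character "labels" such as '[', "'", ' ' and ',' -- much like the debug data output above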

In your script you can check for bad labels by doing something like assert cat in cats_empty, so that you don't accidentally add new categories at runtime.
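
A minimal sketch of that check wrapped in a small helper; the helper name and the whitespace stripping are assumptions, not part of the original script:

def labels_from_cell(cell, cats_empty):
    # Hypothetical helper: build a cats dict from a GROUP_CONCAT cell, rejecting unknown labels.
    cats = dict(cats_empty)
    for cat in cell.split(','):
        cat = cat.strip()  # assumption: GROUP_CONCAT output may have spaces after the commas
        # fail loudly instead of silently creating a new category at runtime
        assert cat in cats_empty, f"Unknown label: {cat!r}"
        cats[cat] = 1
    return cats

# inside the conversion loop above:
# doc.cats = labels_from_cell(row["GROUP_CONCAT(j.name)"], cats_empty)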
