作为完成此任务的替代方法:IN属性中包含多词条的模式
我编写了以下代码来匹配短语,为其打上标签,然后在 EntityRuler 模式中使用这些标签:
# %%
import spacy
from spacy.matcher import PhraseMatcher
from spacy.pipeline import EntityRuler
from spacy.tokens import Span, Token
class PhraseRuler(object):
    """Pipeline component that turns exact phrase matches into entities.

    The phrases in ``terms`` are registered in a ``PhraseMatcher`` under the
    key ``label``.  Calling the component on a document replaces
    ``doc.ents`` with one span per match.
    """

    name = 'phrase_ruler'

    def __init__(self, nlp, terms, label):
        """Build the phrase matcher.

        nlp   -- pipeline used to turn each term into a pattern document;
                 its vocab is shared with the matcher.
        terms -- iterable of phrase strings to look for.
        label -- key under which all phrases are registered.
        """
        term_docs = []
        for term in terms:
            term_docs.append(nlp(term))
        phrase_matcher = PhraseMatcher(nlp.vocab)
        phrase_matcher.add(label, None, *term_docs)
        self.matcher = phrase_matcher

    def __call__(self, doc):
        """Assign one entity span per phrase match to ``doc.ents``; return ``doc``."""
        found = self.matcher(doc)
        # Each match is a (match key, start token index, end token index) triple;
        # the match key is passed straight through as the span label.
        doc.ents = [
            Span(doc, begin, stop, label=match_key)
            for match_key, begin, stop in found
        ]
        return doc
# Load the pretrained English pipeline; the "ner" component referenced below
# is expected to be part of it.
nlp = spacy.load("en_core_web_lg")
# Register the two greeting phrases under "GREETING" and run that component
# ahead of "ner".
entity_matcher = PhraseRuler(nlp, ["Best Wishes", "Warm Welcome"], "GREETING")
nlp.add_pipe(entity_matcher, before="ner")
# Rule: a token whose lowercase form is "super", followed by a single token
# whose entity type is GREETING, should be labelled SUPER_GREETING.
ruler = EntityRuler(nlp)
patterns = [{"label": "SUPER_GREETING", "pattern": [{"LOWER": "super"}, {"ENT_TYPE": "GREETING"}]}]
ruler.add_patterns(patterns)
#ruler.to_disk("./data/patterns.jsonl")
# The ruler is added without a before/after position.
nlp.add_pipe(ruler)
# Show the resulting component order, then run the example sentence and dump
# the analysed document as JSON.
print(nlp.pipe_names)
doc = nlp("Mary said Best Wishes and I said super Warm Welcome.")
print(doc.to_json())
不幸的是,这不起作用,因为结果中没有返回我的 SUPER_GREETING:
'ents': [
{'start': 0, 'end': 4, 'label': 'PERSON'},
{'start': 10, 'end': 21, 'label': 'GREETING'},
{'start': 39, 'end': 51, 'label': 'GREETING'}
]
我究竟做错了什么?我如何解决它?
您的思路是正确的,但这里的问题出在spaCy的一个固有设计选择上:任何token(词符)都只能属于一个命名实体。因此,“Warm Welcome”不能既是一个“GREETING”,同时又是“SUPER_GREETING”的一部分。
解决此问题的一种方法是使用自定义扩展属性(custom extension)。例如,一种解决方案是把GREETING这一信息存储在token级别:
# Register a custom token attribute named "mylabel" whose default value is the
# empty string; it is accessed as token._.mylabel.
Token.set_extension("mylabel", default="")
然后我们调整 PhraseRuler.__call__,使其不再写入 doc.ents,而是这样做:
# Mark every token of the matched span through the custom "mylabel" attribute
# instead of turning the span into an entity.
for token in span:
    token._.mylabel = "MY_GREETING"
现在,我们可以将SUPER_GREETING模式重写为:
# "super" (matched on the lowercase form) followed by one or more tokens whose
# custom attribute mylabel equals "MY_GREETING".
patterns = [{"label": "SUPER_GREETING", "pattern": [{"LOWER": "super"}, {"_": {"mylabel": "MY_GREETING"}, "OP": "+"}]}]
它将匹配“super”后跟一个或多个带有“MY_GREETING”标记的token。匹配是贪婪的,因此会输出“super Warm Welcome”作为命中结果。
这是从您的代码开始并按照说明进行调整的结果代码片段:
# Register the custom token attribute used below as ``token._.mylabel``.
# ``set_extension`` raises if the name is already registered, so the guard
# keeps a re-run of this cell from failing.
if not Token.has_extension("mylabel"):
    Token.set_extension("mylabel", default="")


class PhraseRuler(object):
    """Pipeline component that tags the tokens of exact phrase matches.

    Unlike an entity-writing component, this one leaves ``doc.ents``
    untouched: every token inside a match gets ``token._.mylabel`` set to
    ``"MY_GREETING"``, so later components can still claim those tokens for
    an entity of their own.
    """

    name = 'phrase_ruler'

    def __init__(self, nlp, terms, label):
        """Build the phrase matcher.

        nlp   -- pipeline whose tokenizer and vocab are used for the patterns.
        terms -- iterable of phrase strings to look for.
        label -- key under which the phrases are registered in the matcher;
                 the value written to the tokens is always "MY_GREETING".
        """
        # Only tokenization is needed to match on the token text, so skip
        # the rest of the pipeline when building the pattern documents.
        patterns = [nlp.make_doc(term) for term in terms]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add(label, None, *patterns)

    def __call__(self, doc):
        """Tag all tokens covered by a phrase match and return ``doc``."""
        # The match key is not needed here; only the token range is used.
        for _match_key, start, end in self.matcher(doc):
            for token in doc[start:end]:
                token._.mylabel = "MY_GREETING"
        return doc
# Load the pretrained English pipeline; the "ner" component referenced below
# is expected to be part of it.
nlp = spacy.load("en_core_web_lg")
# The phrase component is given an explicit name so the ruler can be placed
# relative to it further down.
entity_matcher = PhraseRuler(nlp, ["Best Wishes", "Warm Welcome"], "GREETING")
nlp.add_pipe(entity_matcher, name="entity_matcher", before="ner")
# Rule: "super" (matched on the lowercase form) followed by one or more tokens
# whose custom attribute mylabel equals "MY_GREETING".
ruler = EntityRuler(nlp)
patterns = [{"label": "SUPER_GREETING", "pattern": [{"LOWER": "super"}, {"_": {"mylabel": "MY_GREETING"}, "OP": "+"}]}]
ruler.add_patterns(patterns)
# Place the ruler directly after the phrase component that sets mylabel.
nlp.add_pipe(ruler, after="entity_matcher")
print(nlp.pipe_names)
doc = nlp("Mary said Best Wishes and I said super Warm Welcome.")
# Print each token with its custom label, then the final entities.
print("TOKENS:")
for token in doc:
    print(token.text, token._.mylabel)
print()
print("ENTITIES:")
for ent in doc.ents:
    print(ent.text, ent.label_)
其输出为:
TOKENS:
Mary
said
Best MY_GREETING
Wishes MY_GREETING
and
I
said
super
Warm MY_GREETING
Welcome MY_GREETING
.
ENTITIES:
Mary PERSON
super Warm Welcome SUPER_GREETING
这可能不完全是您需要/想要的,但我希望它能帮助您为自己的具体用例找到替代方案。如果您确实希望普通的“GREETING”跨度也出现在最终的 doc.ents 中,也许可以在 EntityRuler 运行之后,通过后处理把它们重新加入:例如,在不发生重叠的情况下把自定义属性所标记的跨度移入 doc.ents,或者把这些 spans 另行保存在某处。
本文收集自互联网,转载请注明来源。
如有侵权,请联系 [email protected] 删除。
我来说两句