File size: 1,528 Bytes
91eaff6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
`spaCy-entity-linker` demo from
<https://github.com/egerber/spaCy-entity-linker/issues/18>
"""

from icecream import ic  # pylint: disable=E0401
import spacy  # pylint: disable=E0401
import spacy_entity_linker as sel  # pylint: disable=E0401


def link_wikidata (
    doc: spacy.tokens.doc.Doc,
    ) -> None:
    """
    Run the `spaCy-entity-linker` classifier over every span in
    `doc.ents`, tracing each step to the console.

    For each span this prints a blank line, then traces the span text
    and label, followed by the candidate entities found for that span.
    When at least one candidate exists, the classifier selects one and
    its attributes, sub-entities, and super-entities (at most 10 of
    each) are traced as well.

    Nothing is returned; all results go through `print()` and `ic()`.
    """
    classifier = sel.EntityClassifier.EntityClassifier()

    # NOTE: `ic()` echoes the source text of its arguments, so the local
    # names `ent`, `candidates`, and `entity` are part of the output
    for ent in doc.ents:
        print()
        ic(ent.text, ent.label_)

        # build a term (a simple span) then identify all
        # the candidate entities for it
        term: sel.TermCandidate.TermCandidate = sel.TermCandidate.TermCandidate(ent)

        candidates: sel.EntityCandidates.EntityCandidates = term.get_entity_candidates()
        ic(candidates)

        # nothing to classify for this span
        if len(candidates) == 0:
            continue

        # select the best candidate
        entity: sel.EntityElement.EntityElement = classifier(candidates)

        ic(entity.__dict__)
        ic(entity.get_sub_entities(limit=10))
        ic(entity.get_super_entities(limit=10))


if __name__ == "__main__":
    # short demo passage; the triple-quoted literal carries a leading
    # and a trailing newline, which get trimmed before parsing
    SRC_TEXT: str = """
Werner Herzog is a remarkable filmmaker and an intellectual originally from Germany, the son of Dietrich Herzog.
After the war, Werner fled to America to become famous.
"""

    # load the spaCy pipeline used to find the entity spans
    nlp: spacy.Language = spacy.load("en_core_web_sm")

    # parse the trimmed passage, then link each entity span it contains
    trimmed_text: str = SRC_TEXT.strip()
    sample_doc: spacy.tokens.doc.Doc = nlp(trimmed_text)

    link_wikidata(sample_doc)