Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- package main
- import (
- "bytes"
- "encoding/json"
- "fmt"
- "io"
- "io/ioutil"
- "reflect"
- "gopkg.in/jdkato/prose.v2"
- )
- // ProdigyOutput represents a single entry of Prodigy's JSON Lines output.
- //
- // `LabeledEntity` is a structure defined by prose that specifies where the
- // entities are within the given `Text`.
- type ProdigyOutput struct {
- Text string
- Spans []prose.LabeledEntity
- Answer string
- }
- // ReadProdigy reads our JSON Lines file line-by-line, populating a
- // slice of `ProdigyOutput` structures.
- func ReadProdigy(jsonLines []byte) []ProdigyOutput {
- dec := json.NewDecoder(bytes.NewReader(jsonLines))
- entries := []ProdigyOutput{}
- for {
- ent := ProdigyOutput{}
- err := dec.Decode(&ent)
- if err != nil {
- if err == io.EOF {
- break
- }
- panic(err)
- }
- entries = append(entries, ent)
- }
- return entries
- }
- // Split divides our human-annotated data set into two groups: one for training
- // our model and one for testing it.
- //
- // We're using an 80-20 split here, although you may want to use a different
- // split.
- func Split(data []ProdigyOutput) ([]prose.EntityContext, []ProdigyOutput) {
- cutoff := int(float64(len(data)) * 0.8)
- train, test := []prose.EntityContext{}, []ProdigyOutput{}
- for i, entry := range data {
- if i < cutoff {
- train = append(train, prose.EntityContext{
- Text: entry.Text,
- Spans: entry.Spans,
- Accept: entry.Answer == "accept"})
- } else {
- test = append(test, entry)
- }
- }
- return train, test
- }
- func main() {
- data, err := ioutil.ReadFile("reddit_product.jsonl")
- if err != nil {
- panic(err)
- }
- train, test := Split(ReadProdigy(data))
- // Here, we're training a new model named PRODUCT with the training portion
- // of our annotated data.
- //
- // Depending on your hardware, this should take around 1 - 3 minutes.
- model := prose.ModelFromData("PRODUCT", prose.UsingEntities(train))
- // Now, Let's test our model:
- correct := 0.0
- for _, entry := range test {
- // Create a document without segmentation, which isn't required for NER.
- doc, err := prose.NewDocument(
- entry.Text,
- prose.WithSegmentation(false),
- prose.UsingModel(model))
- if err != nil {
- panic(err)
- }
- ents := doc.Entities()
- if entry.Answer != "accept" && len(ents) == 0 {
- // If we rejected this entity during annotation, prose shouldn't
- // have labeled it.
- correct++
- } else {
- // Otherwise, we need to verify that we found the correct entities.
- expected := []string{}
- for _, span := range entry.Spans {
- expected = append(expected, entry.Text[span.Start:span.End])
- }
- if reflect.DeepEqual(expected, ents) {
- correct++
- }
- }
- }
- fmt.Printf("Correct (%%): %f\n", correct / float64(len(test)))
- }
Add Comment
Please, Sign In to add comment