Guest User

Untitled

a guest
Jul 15th, 2018
91
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.06 KB | None | 0 0
  1. package main
  2.  
  3. import (
  4. "bytes"
  5. "encoding/json"
  6. "fmt"
  7. "io"
  8. "io/ioutil"
  9. "reflect"
  10.  
  11. "gopkg.in/jdkato/prose.v2"
  12. )
  13.  
  14.  
  15. // ProdigyOutput represents a single entry of Prodigy's JSON Lines output.
  16. //
  17. // `LabeledEntity` is a structure defined by prose that specifies where the
  18. // entities are within the given `Text`.
  19. type ProdigyOutput struct {
  20. Text string
  21. Spans []prose.LabeledEntity
  22. Answer string
  23. }
  24.  
  25. // ReadProdigy reads our JSON Lines file line-by-line, populating a
  26. // slice of `ProdigyOutput` structures.
  27. func ReadProdigy(jsonLines []byte) []ProdigyOutput {
  28. dec := json.NewDecoder(bytes.NewReader(jsonLines))
  29. entries := []ProdigyOutput{}
  30. for {
  31. ent := ProdigyOutput{}
  32. err := dec.Decode(&ent)
  33. if err != nil {
  34. if err == io.EOF {
  35. break
  36. }
  37. panic(err)
  38. }
  39. entries = append(entries, ent)
  40. }
  41. return entries
  42. }
  43.  
  44. // Split divides our human-annotated data set into two groups: one for training
  45. // our model and one for testing it.
  46. //
  47. // We're using an 80-20 split here, although you may want to use a different
  48. // split.
  49. func Split(data []ProdigyOutput) ([]prose.EntityContext, []ProdigyOutput) {
  50. cutoff := int(float64(len(data)) * 0.8)
  51.  
  52. train, test := []prose.EntityContext{}, []ProdigyOutput{}
  53. for i, entry := range data {
  54. if i < cutoff {
  55. train = append(train, prose.EntityContext{
  56. Text: entry.Text,
  57. Spans: entry.Spans,
  58. Accept: entry.Answer == "accept"})
  59. } else {
  60. test = append(test, entry)
  61. }
  62. }
  63.  
  64. return train, test
  65. }
  66.  
  67. func main() {
  68. data, err := ioutil.ReadFile("reddit_product.jsonl")
  69. if err != nil {
  70. panic(err)
  71. }
  72. train, test := Split(ReadProdigy(data))
  73.  
  74. // Here, we're training a new model named PRODUCT with the training portion
  75. // of our annotated data.
  76. //
  77. // Depending on your hardware, this should take around 1 - 3 minutes.
  78. model := prose.ModelFromData("PRODUCT", prose.UsingEntities(train))
  79.  
  80. // Now, Let's test our model:
  81. correct := 0.0
  82. for _, entry := range test {
  83. // Create a document without segmentation, which isn't required for NER.
  84. doc, err := prose.NewDocument(
  85. entry.Text,
  86. prose.WithSegmentation(false),
  87. prose.UsingModel(model))
  88.  
  89. if err != nil {
  90. panic(err)
  91. }
  92. ents := doc.Entities()
  93.  
  94. if entry.Answer != "accept" && len(ents) == 0 {
  95. // If we rejected this entity during annotation, prose shouldn't
  96. // have labeled it.
  97. correct++
  98. } else {
  99. // Otherwise, we need to verify that we found the correct entities.
  100. expected := []string{}
  101. for _, span := range entry.Spans {
  102. expected = append(expected, entry.Text[span.Start:span.End])
  103. }
  104. if reflect.DeepEqual(expected, ents) {
  105. correct++
  106. }
  107. }
  108. }
  109. fmt.Printf("Correct (%%): %f\n", correct / float64(len(test)))
  110. }
Add Comment
Please, Sign In to add comment