Advertisement
Guest User

Untitled

a guest
Apr 23rd, 2019
62
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.83 KB | None | 0 0
  1. fn segment_words(&mut self, text: &[String], search_mode: bool) -> Option<Vec<SegmentCedar>> {
  2. if search_mode && text.len() == 1 {
  3. return None;
  4. }
  5.  
  6. let mut jumpers: Vec<Jumper> = (0..text.len()).map(|_| Jumper::new()).collect();
  7. let mut tokens: Vec<Option<TokenID>> = vec![None; self.dict.max_token_length as usize];
  8.  
  9. for current in 0..text.len() {
  10. let mut base_distance = 0f32;
  11. if current > 0 {
  12. base_distance = jumpers[current - 1].min_distance;
  13. }
  14.  
  15. let mut _tokens: &Vec<Option<TokenID>> = &mut tokens;
  16. let num_tokens = self.dict.lookup_tokens(
  17. &text[current..min(current + self.dict.max_token_length as usize, text.len())],
  18. &mut _tokens,
  19. );
  20.  
  21. // println!("{:?}, {:?}, {:?}", text[current], num_tokens, tokens);
  22.  
  23. let tokens2: &Vec<Option<TokenID>> = _tokens;
  24. for itoken in 0..num_tokens {
  25. let location =
  26. current + self.num_words(tokens2[itoken].as_ref().unwrap()) as usize - 1;
  27. if !search_mode || current != 0 || location != text.len() - 1 {
  28. jumpers[location].update(
  29. base_distance,
  30. _tokens[itoken].as_ref().unwrap(),
  31. self.get_distance(_tokens[itoken].as_ref().unwrap()),
  32. );
  33. }
  34. }
  35.  
  36. if num_tokens == 0 || self.num_words(_tokens[0].as_ref().unwrap()) > 1 {
  37. let token = TokenID::Raw(text[current].clone());
  38. jumpers[current].update(base_distance, &token, 32f32);
  39. }
  40. }
  41.  
  42. let mut num_seg = 0;
  43.  
  44. let mut index: i32 = (text.len() - 1) as i32;
  45. while index >= 0 {
  46. let location =
  47. index - self.num_words(&jumpers[index as usize].token.unwrap()) as i32 + 1;
  48. num_seg += 1;
  49. index = location - 1;
  50. }
  51.  
  52. let mut output_segments: Vec<SegmentCedar> = vec![SegmentCedar::default(); num_seg];
  53. index = text.len() as i32 - 1;
  54. while index >= 0 {
  55. let location =
  56. index - self.num_words(&jumpers[index as usize].token.unwrap()) as i32 + 1;
  57. num_seg -= 1;
  58. output_segments[num_seg as usize] = SegmentCedar {
  59. start: 0,
  60. end: 0,
  61. tokenid: jumpers[index as usize].token.unwrap().clone(),
  62. };
  63. index = location - 1;
  64. }
  65.  
  66. let mut byte_position = 0;
  67. for iseg in 0..output_segments.len() {
  68. output_segments[iseg].start = byte_position;
  69. byte_position += self.byte_len(&output_segments[iseg].tokenid) as u32;
  70. output_segments[iseg].end = byte_position;
  71. }
  72.  
  73. Some(output_segments)
  74. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement