Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- fn segment_words(&mut self, text: &[String], search_mode: bool) -> Option<Vec<SegmentCedar>> {
- if search_mode && text.len() == 1 {
- return None;
- }
- let mut jumpers: Vec<Jumper> = (0..text.len()).map(|_| Jumper::new()).collect();
- let mut tokens: Vec<Option<TokenID>> = vec![None; self.dict.max_token_length as usize];
- for current in 0..text.len() {
- let mut base_distance = 0f32;
- if current > 0 {
- base_distance = jumpers[current - 1].min_distance;
- }
- let mut _tokens: &Vec<Option<TokenID>> = &mut tokens;
- let num_tokens = self.dict.lookup_tokens(
- &text[current..min(current + self.dict.max_token_length as usize, text.len())],
- &mut _tokens,
- );
- // println!("{:?}, {:?}, {:?}", text[current], num_tokens, tokens);
- let tokens2: &Vec<Option<TokenID>> = _tokens;
- for itoken in 0..num_tokens {
- let location =
- current + self.num_words(tokens2[itoken].as_ref().unwrap()) as usize - 1;
- if !search_mode || current != 0 || location != text.len() - 1 {
- jumpers[location].update(
- base_distance,
- _tokens[itoken].as_ref().unwrap(),
- self.get_distance(_tokens[itoken].as_ref().unwrap()),
- );
- }
- }
- if num_tokens == 0 || self.num_words(_tokens[0].as_ref().unwrap()) > 1 {
- let token = TokenID::Raw(text[current].clone());
- jumpers[current].update(base_distance, &token, 32f32);
- }
- }
- let mut num_seg = 0;
- let mut index: i32 = (text.len() - 1) as i32;
- while index >= 0 {
- let location =
- index - self.num_words(&jumpers[index as usize].token.unwrap()) as i32 + 1;
- num_seg += 1;
- index = location - 1;
- }
- let mut output_segments: Vec<SegmentCedar> = vec![SegmentCedar::default(); num_seg];
- index = text.len() as i32 - 1;
- while index >= 0 {
- let location =
- index - self.num_words(&jumpers[index as usize].token.unwrap()) as i32 + 1;
- num_seg -= 1;
- output_segments[num_seg as usize] = SegmentCedar {
- start: 0,
- end: 0,
- tokenid: jumpers[index as usize].token.unwrap().clone(),
- };
- index = location - 1;
- }
- let mut byte_position = 0;
- for iseg in 0..output_segments.len() {
- output_segments[iseg].start = byte_position;
- byte_position += self.byte_len(&output_segments[iseg].tokenid) as u32;
- output_segments[iseg].end = byte_position;
- }
- Some(output_segments)
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement