Advertisement
Guest User

Untitled

a guest
Feb 25th, 2017
73
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
MatLab 1.82 KB | None | 0 0
  1.  
  function LM = lm_train(dataDir, language, fn_LM)
  %
  %  lm_train
  %
  %  Reads every file in dataDir whose name ends with `language`, tokenizes
  %  each line, accumulates unigram and bigram counts, and saves the
  %  resulting structure to fn_LM.
  %
  %  INPUTS:
  %
  %       dataDir     : (directory name) The top-level directory containing
  %                                      data from which to train or decode
  %                                      e.g., '/u/cs401/A2_SMT/data/Toy/'
  %       language    : (string) either 'e' for English or 'f' for French
  %       fn_LM       : (filename) the location to save the language model,
  %                                once trained
  %  OUTPUT:
  %
  %       LM          : (variable) a specialized language model structure
  %
  %  The file fn_LM contains the data structure called 'LM', which is a
  %  structure having two fields: 'uni' and 'bi', each of which holds
  %  sub-structures which incorporate unigram or bigram COUNTS,
  %
  %       e.g., LM.uni.word = 5       % the word 'word' appears 5 times
  %             LM.bi.word.bird = 2   % the bigram 'word bird' appears twice
  %
  %  Every token of a processed line is counted as a unigram, including the
  %  first and last ones, so that any sentence-boundary markers emitted by
  %  preprocess (presumably SENTSTART / SENTEND -- confirm against
  %  preprocess) have counts available as bigram denominators.
  %
  % Template (c) 2011 Frank Rudzicz

  % Declared by the assignment template; not referenced in this function.
  global CSC401_A2_DEFNS

  LM = struct();
  LM.uni = struct();
  LM.bi = struct();

  DD = dir( [ dataDir, filesep, '*', language] );

  for iFile = 1:length(DD)

    % The wildcard can also match directories; those cannot be read as text.
    if DD(iFile).isdir
      continue;
    end

    lines = textread([dataDir, filesep, DD(iFile).name], '%s', 'delimiter', '\n');

    for l = 1:length(lines)

      % preprocess is defined elsewhere in the project; it is assumed to
      % return a single space-separated string of tokens that are valid
      % struct field names -- TODO confirm.
      processedLine = preprocess(lines{l}, language);

      % Split on single spaces with regexp rather than strsplit: the original
      % call strsplit(' ', processedLine) only works with a project-supplied
      % helper, because MATLAB's built-in strsplit takes (str, delimiter).
      words = regexp(processedLine, ' ', 'split');

      % Consecutive/leading/trailing spaces yield empty tokens, which are
      % not valid field names; drop them before counting.
      words = words(~cellfun('isempty', words));

      for iWord = 1:length(words)
        % Brace indexing yields the char array required by dynamic fields
        % (paren indexing would yield a 1x1 cell and raise an error).
        word = words{iWord};

        % Unigram count: increment if seen before, otherwise start at 1.
        if isfield(LM.uni, word)
          LM.uni.(word) = LM.uni.(word) + 1;
        else
          LM.uni.(word) = 1;
        end

        % Bigram count for the pair (previous token, current token).
        if iWord > 1
          prevWord = words{iWord - 1};
          if isfield(LM.bi, prevWord) && isfield(LM.bi.(prevWord), word)
            LM.bi.(prevWord).(word) = LM.bi.(prevWord).(word) + 1;
          else
            % Assignment creates LM.bi.(prevWord) on demand if it is missing.
            LM.bi.(prevWord).(word) = 1;
          end
        end
      end
    end
  end

  save( fn_LM, 'LM', '-mat');
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement