Guest User

Untitled

a guest
May 27th, 2018
121
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.36 KB | None | 0 0
  #!/usr/bin/perl -w
  use strict;
  use warnings;
  use utf8;

  # Lingotek to TBX converter.
  #
  # Usage: perl <this script> PATH_PREFIX SOURCE_LANG TARGET_LANG
  #
  #   PATH_PREFIX  string prepended to the two fixed file names: terms are read
  #                from "<PATH_PREFIX>l480.csv" and the TBX document is written
  #                to "<PATH_PREFIX>tbx.tbx".
  #   SOURCE_LANG  language code emitted as xml:lang for the first column.
  #   TARGET_LANG  language code emitted as xml:lang for the second column.
  #
  # Every input line is read as tab-separated columns; the first column becomes
  # the source-language term and the second the target-language term of one
  # <termEntry>.  Further columns are ignored.  Lines that do not provide both
  # terms are skipped (with a warning when they are not blank) so that raw,
  # unescaped text never ends up inside the XML body.
  #
  # On success the number of input lines read is printed to STDOUT.

  die "Usage: $0 PATH_PREFIX SOURCE_LANG TARGET_LANG\n" if @ARGV < 3;

  # Take the arguments themselves; assigning the result of print() would store
  # print's return value (1) instead of the argument.
  my ($sPath, $sSourceLang, $sTargetLang) = @ARGV;

  die "SOURCE_LANG and TARGET_LANG must not be empty.\n"
      unless length($sSourceLang) && length($sTargetLang);

  my $nLCnt        = 0;        # input lines read
  my $nTCnt        = 0;        # term entries written
  my $sTermEntryid = "L00";    # prefix shared by every generated id

  my $sInFile  = $sPath . "l480.csv";
  my $sOutFile = $sPath . "tbx.tbx";

  open(my $FILEIN, "<:encoding(UTF-8)", $sInFile)
      or die "Can't open INPUT file '$sInFile': $!\n";
  open(my $FILEOUT, ">:encoding(UTF-8)", $sOutFile)
      or die "Can't open OUTPUT file '$sOutFile': $!\n";

  # The language codes are written into attribute values, so escape them once.
  my $sSourceAttr = xml_escape($sSourceLang);
  my $sTargetAttr = xml_escape($sTargetLang);

  # Creates the header of the TBX file
  print {$FILEOUT} "<?xml version='1.0' encoding=\"UTF-8\"?>\n<!DOCTYPE martif SYSTEM \"TBXcoreStructV02.dtd\">\n<martif type=\"TBX\" xml:lang=\"$sSourceAttr\">\n";
  print {$FILEOUT} "<martifHeader>\n<fileDesc>\n<sourceDesc>\n<p>Made by the Lingotek to TBX converter</p>\n</sourceDesc>\n</fileDesc>\n";
  print {$FILEOUT} "<encodingDesc>\n<p type=\"DCSName\">TBXbasicV2.xcs</p>\n</encodingDesc>\n</martifHeader>\n<text>\n<body>\n\n";

  while (my $sLineIn = <$FILEIN>) {
      $nLCnt++;

      # Strip the line ending, including the "\r" left by CRLF files, and a
      # byte-order mark if the file starts with one; otherwise both would
      # become part of a term.
      $sLineIn =~ s/\r?\n\z//;
      $sLineIn =~ s/\A\x{FEFF}// if $nLCnt == 1;

      # Split on tabs instead of matching an ASCII-only character class, so
      # terms with accented letters, punctuation or non-Latin scripts are kept
      # whole.
      my ($sSourceTerm, $sTargetTerm) = split /\t/, $sLineIn, -1;

      unless (defined($sSourceTerm) && length($sSourceTerm)
          && defined($sTargetTerm) && length($sTargetTerm))
      {
          warn "Skipping line $nLCnt: expected \"source<TAB>target\".\n"
              if $sLineIn =~ /\S/;
          next;
      }

      $nTCnt++;
      print {$FILEOUT} term_entry(
          $sTermEntryid, $nTCnt,
          $sSourceAttr,  xml_escape($sSourceTerm),
          $sTargetAttr,  xml_escape($sTargetTerm),
      ), "\n";
  }

  print {$FILEOUT} "</body>\n</text>\n</martif>";

  close($FILEIN) or warn "Can't close INPUT file '$sInFile': $!\n";

  # Buffered write errors only surface at close, so this check is required.
  close($FILEOUT) or die "Can't close OUTPUT file '$sOutFile': $!\n";

  print $nLCnt.' FINISHED:=';

  # Returns $sText with the characters that are special in XML text and in
  # double-quoted attribute values replaced by entity references.
  sub xml_escape {
      my ($sText) = @_;
      $sText =~ s/&/&amp;/g;    # must run first so later entities stay intact
      $sText =~ s/</&lt;/g;
      $sText =~ s/>/&gt;/g;
      $sText =~ s/"/&quot;/g;
      return $sText;
  }

  # Builds one <termEntry> element (without a trailing newline).
  #
  #   $sIdPrefix  prefix for the termEntry and ntig ids
  #   $nIndex     running number of the entry, appended to every id
  #   $sSrcLang   source language code, already XML-escaped
  #   $sSrcTerm   source term, already XML-escaped
  #   $sTgtLang   target language code, already XML-escaped
  #   $sTgtTerm   target term, already XML-escaped
  #
  # No empty partOfSpeech <termNoteGrp> is emitted: there is never a value to
  # put in it, so it is left out rather than generated and stripped again.
  sub term_entry {
      my ($sIdPrefix, $nIndex, $sSrcLang, $sSrcTerm, $sTgtLang, $sTgtTerm) = @_;

      return join "\n",
          "<termEntry id=\"$sIdPrefix$nIndex\">",
          "<langSet xml:lang=\"$sSrcLang\">",
          "<ntig id=\"$sIdPrefix$sSrcLang$nIndex\">",
          "<termGrp>",
          "<term>$sSrcTerm</term>",
          "</termGrp>",
          "</ntig>",
          "</langSet>",
          "<langSet xml:lang=\"$sTgtLang\">",
          "<ntig id=\"$sIdPrefix$sTgtLang$nIndex\">",
          "<termGrp>",
          "<term>$sTgtTerm</term>",
          "</termGrp>",
          "</ntig></langSet>",
          "</termEntry>";
  }
Add Comment
Please, Sign In to add comment