Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/perl -w
use strict;
use warnings;
use utf8;

# Lingotek CSV -> TBX converter.
#
# Usage: script <path-prefix> <source-lang> <target-lang>
#
#   <path-prefix>  string prepended to the fixed file names "l480.csv"
#                  (input) and "tbx.tbx" (output)
#   <source-lang>  language code written into the source langSet
#   <target-lang>  language code written into the target langSet
#
# Every "source<TAB>target" pair found in the input (terms limited to
# ASCII letters, digits and spaces) is replaced by a <termEntry> element.

die "Usage: $0 <path-prefix> <source-lang> <target-lang>\n"
    unless @ARGV == 3;

# Take the arguments themselves. (Assigning the result of `print $ARGV[n]`
# would store print's return value, 1, instead of the argument.)
my ($sPath, $sSourceLang, $sTargetLang) = @ARGV;

my $nLCnt        = 0;        # input line counter
my $nTCnt        = 0;        # term entry counter
my $sTermEntryid = "L00";    # prefix shared by all generated ids

my $sInPath  = $sPath . "l480.csv";
my $sOutPath = $sPath . "tbx.tbx";

open(my $fhIn, "<:encoding(UTF-8)", $sInPath)
    or die "Can't open INPUT file '$sInPath': $!\n";
open(my $fhOut, ">:encoding(UTF-8)", $sOutPath)
    or die "Can't open OUTPUT file '$sOutPath': $!\n";

# Header of the TBX file. The blank line before the terminator is
# intentional: the header ends with an empty line after <body>.
print {$fhOut} <<"END_HEADER";
<?xml version='1.0' encoding="UTF-8"?>
<!DOCTYPE martif SYSTEM "TBXcoreStructV02.dtd">
<martif type="TBX" xml:lang="$sSourceLang">
<martifHeader>
<fileDesc>
<sourceDesc>
<p>Made by the Lingotek to TBX converter</p>
</sourceDesc>
</fileDesc>
<encodingDesc>
<p type="DCSName">TBXbasicV2.xcs</p>
</encodingDesc>
</martifHeader>
<text>
<body>

END_HEADER

while (my $sLineIn = <$fhIn>) {
    # Strip LF as well as CRLF line endings so that no stray "\r" is
    # carried into the output.
    $sLineIn =~ s/[\r\n]+\z//;
    $nLCnt++;

    # Replace each "source<TAB>target" pair with a term entry. The counter
    # is incremented once per replaced pair, so ids stay unique even when
    # a single line contains more than one pair.
    # NOTE(review): text that is not part of a matched pair (including
    # whole lines without any pair) is written to the output unchanged.
    $sLineIn =~ s{([A-Za-z0-9 ]+)\t([A-Za-z0-9 ]+)}
                 {term_entry_xml($sTermEntryid, ++$nTCnt,
                                 $sSourceLang, $1, $sTargetLang, $2)}ge;

    print {$fhOut} $sLineIn . "\n";
}

print {$fhOut} "</body>\n</text>\n</martif>";

close($fhIn);
# Buffered write errors only surface at close, so check it.
close($fhOut) or die "Can't close OUTPUT file '$sOutPath': $!\n";

print $nLCnt . ' FINISHED:=';

# Build the <termEntry> markup for one source/target term pair.
#
#   $sIdPrefix  prefix for the termEntry and ntig ids
#   $nId        running number that makes the ids unique
#   $sSrcLang   language code of the source langSet
#   $sSrcTerm   source term text
#   $sTgtLang   language code of the target langSet
#   $sTgtTerm   target term text
#
# Returns the markup as one string without a trailing newline.
sub term_entry_xml {
    my ($sIdPrefix, $nId, $sSrcLang, $sSrcTerm, $sTgtLang, $sTgtTerm) = @_;

    return join "\n",
        qq{<termEntry id="$sIdPrefix$nId">},
        qq{<langSet xml:lang="$sSrcLang">},
        qq{<ntig id="$sIdPrefix$sSrcLang$nId">},
        '<termGrp>',
        "<term>$sSrcTerm</term>",
        '</termGrp>',
        '</ntig>',
        '</langSet>',
        qq{<langSet xml:lang="$sTgtLang">},
        qq{<ntig id="$sIdPrefix$sTgtLang$nId">},
        '<termGrp>',
        "<term>$sTgtTerm</term>",
        '</termGrp>',
        '</ntig></langSet>',
        '</termEntry>';
}
Add Comment
Please, Sign In to add comment