# Untitled

#!/usr/bin/perl -w
use strict;
use utf8;


# * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
# Positional command-line arguments:
#   $ARGV[0]  path prefix; the fixed names "l480.csv" (input) and "tbx.tbx"
#             (output) are appended to it as-is, so a directory must be given
#             with its trailing separator
#   $ARGV[1]  source (original) language code, written into xml:lang
#   $ARGV[2]  target language code, written into xml:lang
#
# The arguments are assigned directly: `print` returns its success flag (1),
# not the text it printed, so it must not appear on the right-hand side.
@ARGV >= 3
    or die "Usage: $0 <path-prefix> <source-lang> <target-lang>\n";

my $sPath       = $ARGV[0];     #lingotek file path prefix
my $sSourceLang = $ARGV[1];     #original language
my $sTargetLang = $ARGV[2];     #target language
# * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *


# * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *

# Convert the tab-separated glossary "<path>l480.csv" into the TBX file
# "<path>tbx.tbx".
#
# Every input line is expected to be "source term<TAB>target term".  Each
# such line becomes one <termEntry> holding one <langSet> per language.
# Columns after the second are ignored.  Blank lines are skipped silently;
# lines without two non-empty columns are skipped with a warning on STDERR
# instead of being copied verbatim into <body>, which would break the XML.
# Terms may contain any characters (the files are read/written as UTF-8);
# XML-significant characters are escaped on output.

my $FILEIN;
my $FILEOUT;
my $sLineIn;
my $nLCnt=0;     #line counter (every line read from the input)
my $nTCnt=0;     #term entry counter (only lines that produced a termEntry)
my $sTermEntryid ="L00";

# Escape the characters that are significant in XML text content and in
# double-quoted attribute values.  '&' is handled first so the entities
# produced by the later substitutions are not escaped a second time.
sub _xml_escape {
    my ($sText) = @_;
    $sText =~ s/&/&amp;/g;
    $sText =~ s/</&lt;/g;
    $sText =~ s/>/&gt;/g;
    $sText =~ s/"/&quot;/g;
    return $sText;
}

my $sInPath  = $sPath."l480.csv";
my $sOutPath = $sPath."tbx.tbx";

open ($FILEIN, "<:encoding(utf8)", $sInPath)
    or die "Can't open INPUT file '$sInPath': $!\n";
open ($FILEOUT, ">:encoding(utf8)", $sOutPath)
    or die "Can't open OUTPUT file '$sOutPath': $!\n";

# Language codes end up inside attribute values, so escape them once up front.
my $sSrcLangXml = _xml_escape($sSourceLang);
my $sTgtLangXml = _xml_escape($sTargetLang);

# Creates the header of the TBX file
print {$FILEOUT} <<"END_HEADER";
<?xml version='1.0' encoding="UTF-8"?>
<!DOCTYPE martif SYSTEM "TBXcoreStructV02.dtd">
<martif type="TBX" xml:lang="$sSrcLangXml">
<martifHeader>
<fileDesc>
<sourceDesc>
<p>Made by the Lingotek to TBX converter</p>
</sourceDesc>
</fileDesc>
<encodingDesc>
<p type="DCSName">TBXbasicV2.xcs</p>
</encodingDesc>
</martifHeader>
<text>
<body>

END_HEADER

while ($sLineIn = <$FILEIN>)
{
    $nLCnt++;

    # Drop the line terminator (LF or CRLF) and, on the first line only,
    # a leading byte-order mark so neither leaks into a term.
    $sLineIn =~ s/[\r\n]+\z//;
    $sLineIn =~ s/\A\x{FEFF}// if $nLCnt == 1;

    next if $sLineIn !~ /\S/;       # blank line: nothing to convert

    # Source then Target; any further columns are ignored.
    my ($sSourceTerm, $sTargetTerm) = split /\t/, $sLineIn;
    if (!defined $sTargetTerm || $sSourceTerm eq '' || $sTargetTerm eq '')
    {
        warn "Line $nLCnt skipped: expected 'source<TAB>target'.\n";
        next;
    }

    $nTCnt++;
    my $sSrcXml = _xml_escape($sSourceTerm);
    my $sTgtXml = _xml_escape($sTargetTerm);

    # One termEntry per glossary line.  No partOfSpeech termNote is written
    # because the input carries no such information.
    print {$FILEOUT} <<"END_ENTRY";
<termEntry id="$sTermEntryid$nTCnt">
<langSet xml:lang="$sSrcLangXml">
<ntig id="$sTermEntryid$sSrcLangXml$nTCnt">
<termGrp>
<term>$sSrcXml</term>
</termGrp>
</ntig>
</langSet>
<langSet xml:lang="$sTgtLangXml">
<ntig id="$sTermEntryid$sTgtLangXml$nTCnt">
<termGrp>
<term>$sTgtXml</term>
</termGrp>
</ntig></langSet>
</termEntry>
END_ENTRY
}
print {$FILEOUT} "</body>\n</text>\n</martif>";

close ($FILEIN);
# Buffered write errors (disk full, etc.) only surface when the handle is closed.
close ($FILEOUT) or die "Can't finish writing OUTPUT file '$sOutPath': $!\n";

print $nLCnt.' FINISHED:=';