Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- <?php
- /*
- * Transposh v1.0.4.1
- * http://transposh.org/
- *
- * Copyright 2018, Team Transposh
- * Licensed under the GPL Version 2 or higher.
- * http://transposh.org/license
- *
- * Date: Mon, 31 Dec 2018 13:56:20 +0200
- */
- require_once("shd/simple_html_dom.php");
- require_once("constants.php");
- require_once("logging.php");
- require_once("utils.php");
- /**
- * parserstats class - holds parser statistics
- */
- class tp_parserstats {
- /** @var int Holds the total phrases the parser encountered */
- public $total_phrases;
- /** @var int Holds the number of phrases that had translation */
- public $translated_phrases;
- /** @var int Holds the number of phrases that had human translation */
- public $human_translated_phrases;
- /** @var int Holds the number of phrases that are hidden - yet still somewhat viewable (such as the title attribure) */
- public $hidden_phrases;
- /** @var int Holds the number of phrases that are hidden and translated */
- public $hidden_translated_phrases;
- /** @var int Holds the amounts of hidden spans created for translation */
- public $hidden_translateable_phrases;
- /** @var int Holds the number of phrases that are hidden and probably won't be viewed - such as meta keys */
- public $meta_phrases;
- /** @var int Holds the number of translated phrases that are hidden and probably won't be viewed - such as meta keys */
- public $meta_translated_phrases;
- /** @var float Holds the time translation took */
- public $time;
- /** @var int Holds the time translation started */
- private $start_time;
- /**
- * This function is when the object is initialized, which is a good time to start ticking.
- */
- function __construct() {
- $this->start_time = microtime(true);
- }
- /**
- * Calculated values - computer translated phrases
- * @return int How many phrases were auto-translated
- */
- function get_computer_translated_phrases() {
- return $this->translated_phrases - $this->human_translated_phrases;
- }
- /**
- * Calculated values - missing phrases
- * @return int How many phrases are missing
- */
- function get_missing_phrases() {
- return $this->total_phrases - $this->translated_phrases;
- }
- /**
- * Start the timer
- */
- function start_timing() {
- $this->start_time = microtime(true);
- }
- /**
- * Stop timing, store time for reference
- */
- function stop_timing() {
- $this->time = number_format(microtime(true) - $this->start_time, 3);
- }
- }
- /**
- * Parser class - allows phrase marking and translation with callback functions
- */
- class tp_parser {
- private $punct_breaks = true;
- private $num_breaks = true;
- private $ent_breaks = true;
- // functions that need to be defined... //
- /** @var function */
- public $url_rewrite_func = null;
- /** @var function */
- public $fetch_translate_func = null;
- /** @var function */
- public $prefetch_translate_func = null;
- /** @var function */
- public $split_url_func = null;
- /** @var function */
- public $fix_src_tag_func = null;
- /** @var int stores the number of the last used span_id */
- private $span_id = 0;
- /** @var simple_html_dom_node Contains the current node */
- private $currentnode;
- /** @var simple_html_dom Contains the document dom model */
- private $html;
- // the document
- public $dir_rtl;
- /** @var string Contains the iso of the target language */
- public $lang;
- /** @var boolean Contains the fact that this language is the default one (only parse other lanaguage spans) */
- public $default_lang = false;
- /** @var string Contains the iso of the source language - if a lang attribute is found, assumed to be en by default */
- public $srclang;
- private $inbody = false;
- /** @var hold fact that we are in select or other similar elements */
- private $inselect = false;
- public $is_edit_mode;
- public $is_auto_translate;
- public $feed_fix;
- /** @var boolean should we attempt to handle page as json */
- public $might_json = false;
- public $allow_ad = false;
- //first three are html, later 3 come from feeds xml (link is problematic...)
- protected $ignore_tags = array('script' => 1, 'style' => 1, 'code' => 1, 'wfw:commentrss' => 1, 'comments' => 1, 'guid' => 1);
- /** @var parserstats Contains parsing statistics */
- private $stats;
- /** @var boolean Are we inside a translated gettext */
- private $in_get_text = false;
- /** @var boolean Are we inside an inner text %s in gettext */
- private $in_get_text_inner = false;
- /** @var string Additional header information */
- public $added_header;
- /** @var array Contains reference to changable a tags */
- private $atags = array();
- /** @var array Contains reference to changable option values */
- private $otags = array();
- public $edit_span_created = false;
- /** @var array store all values that may be prefetched */
- private $prefetch_phrases = array();
- /**
- * Determine if the current position in buffer is a white space.
- * @param char $char
- * @return boolean true if current position marks a white space
- */
- function is_white_space($char) {
- if (!$char)
- return TRUE;
- return strspn($char, " \t\r\n\0\x0B");
- }
- /**
- * Determine if the current position in page points to a character in the
- * range of a-z (case insensetive).
- * @return boolean true if a-z
- */
- function is_a_to_z_character($char) {
- return (($char >= 'a' && $char <= 'z') || ($char >= 'A' && $char <= 'Z')) ? true : false;
- }
- /**
- * Determine if the current position is a digit.
- * @return boolean true if a digit
- */
- function is_digit($char) {
- return (($char >= '0' && $char <= '9')) ? true : false;
- }
- /**
- * Determine if the current position is an html entity - such as & or “.
- * @param string $string string to evalute
- * @param int $position where to check for entities
- * @return int length of entity
- */
- function is_html_entity($string, $position) {
- if ($string[$position] == '&') {
- $end_pos = $position + 1;
- while ($string[$end_pos] == '#' || $this->is_digit($string[$end_pos]) || $this->is_a_to_z_character($string[$end_pos]))
- ++$end_pos;
- if ($string[$end_pos] == ';')
- return $end_pos - $position + 1;
- }
- return 0;
- }
- /**
- * Some entities will not cause a break if they don't have whitespace after them
- * such as Jack`s apple.
- * `uncatagorized` will break on the later entity
- * Added " quotes to this claim, as it is used in some languages in a similar fashion
- * @param string $entity - html entity to check
- * @return boolean true if not a breaker (apostrophy)
- */
- function is_entity_breaker($entity) { // ‘’??
- return !(stripos('‘’'"''’‘”“', $entity) !== FALSE);
- }
- /**
- * Some entities are to be regarded as simple letters in most cases
- À À À À latin capital letter A with grave
- Á Á Á Á latin capital letter A with acute
- Â Â Â Â latin capital letter A with circumflex
- Ã Ã Ã Ã latin capital letter A with tilde
- Ä Ä Ä Ä latin capital letter A with diaeresis
- Å Å Å Å latin capital letter A with ring above
- Æ Æ Æ Æ latin capital letter AE
- Ç Ç Ç Ç latin capital letter C with cedilla
- È È È È latin capital letter E with grave
- É É É É latin capital letter E with acute
- Ê Ê Ê Ê latin capital letter E with circumflex
- Ë Ë Ë Ë latin capital letter E with diaeresis
- Ì Ì Ì Ì latin capital letter I with grave
- Í Í Í Í latin capital letter I with acute
- Î Î Î Î latin capital letter I with circumflex
- Ï Ï Ï Ï latin capital letter I with diaeresis
- Ð Ð Ð Ð latin capital letter ETH
- Ñ Ñ Ñ Ñ latin capital letter N with tilde
- Ò Ò Ò Ò latin capital letter O with grave
- Ó Ó Ó Ó latin capital letter O with acute
- Ô Ô Ô Ô latin capital letter O with circumflex
- Õ Õ Õ Õ latin capital letter O with tilde
- Ö Ö Ö Ö latin capital letter O with diaeresis
- //× × × × multiplication sign
- Ø Ø Ø Ø latin capital letter O with stroke
- Ù Ù Ù Ù latin capital letter U with grave
- Ú Ú Ú Ú latin capital letter U with acute
- Û Û Û Û latin capital letter U with circumflex
- Ü Ü Ü Ü latin capital letter U with diaeresis
- Ý Ý Ý Ý latin capital letter Y with acute
- Þ Þ Þ Þ latin capital letter THORN
- ß ß ß ß latin small letter sharp s
- à à à à latin small letter a with grave
- á á á á latin small letter a with acute
- â â â â latin small letter a with circumflex
- ã ã ã ã latin small letter a with tilde
- ä ä ä ä latin small letter a with diaeresis
- å å å å latin small letter a with ring above
- æ æ æ æ latin small letter ae
- ç ç ç ç latin small letter c with cedilla
- è è è è latin small letter e with grave
- é é é é latin small letter e with acute
- ê ê ê ê latin small letter e with circumflex
- ë ë ë ë latin small letter e with diaeresis
- ì ì ì ì latin small letter i with grave
- í í í í latin small letter i with acute
- î î î î latin small letter i with circumflex
- ï ï ï ï latin small letter i with diaeresis
- ð ð ð ð latin small letter eth
- ñ ñ ñ ñ latin small letter n with tilde
- ò ò ò ò latin small letter o with grave
- ó ó ó ó latin small letter o with acute
- ô ô ô ô latin small letter o with circumflex
- õ õ õ õ latin small letter o with tilde
- ö ö ö ö latin small letter o with diaeresis
- //÷ ÷ ÷ ÷ division sign
- ø ø ø ø latin small letter o with stroke
- ù ù ù ù latin small letter u with grave
- ú ú ú ú latin small letter u with acute
- û û û û latin small letter u with circumflex
- ü ü ü ü latin small letter u with diaeresis
- ý ý ý ý latin small letter y with acute
- þ þ þ þ latin small letter thorn
- ÿ ÿ ÿ ÿ latin small letter y with diaeresis
- Latin-1 extended
- Œ Œ latin capital ligature OE
- œ œ latin small ligature oe
- Š Š latin capital letter S with caron
- š š latin small letter s with caron
- Ÿ Ÿ latin capital letter Y with diaeresis
- */
- function is_entity_letter($entity) {
- tp_logger("checking ($entity) - " . htmlentities($entity), 4);
- $entnum = (int) substr($entity, 2);
- // skip multiply and divide (215, 247)
- if (($entnum >= 192 && $entnum <= 214) || ($entnum >= 216 && $entnum <= 246) || ($entnum >= 248 && $entnum <= 696)) {
- return true;
- }
- $entities = 'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐ' .
- 'ÑÒÓÔÕÖØÙÚÛÜÝÞß' .
- 'øùÿœš ';
- return (stripos($entities, $entity) !== FALSE);
- }
- /**
- * Determine if the current position in buffer is a sentence breaker, e.g. '.' or ',' .
- * Note html markups are not considered sentence breaker within the scope of this function.
- * @param char $char charcter checked if breaker
- * @param char $nextchar needed for checking if . or - breaks
- * @return int length of breaker if current position marks a break in sentence
- */
- function is_sentence_breaker($char, $nextchar, $nextnextchar) {
- if (($char == '.' || $char == '-') && ($this->is_white_space($nextchar)))
- return 1;
- //,
- if (ord($char) == 239 && ord($nextchar) == 188 && ord($nextnextchar) == 140)
- return 3;
- //。
- if (ord($char) == 227 && ord($nextchar) == 128 && ord($nextnextchar) == 130)
- return 3;
- //、
- if (ord($char) == 227 && ord($nextchar) == 128 && ord($nextnextchar) == 129)
- return 3;
- //;
- if (ord($char) == 239 && ord($nextchar) == 188 && ord($nextnextchar) == 155)
- return 3;
- //:
- if (ord($char) == 239 && ord($nextchar) == 188 && ord($nextnextchar) == 154)
- return 3;
- //∙
- if (ord($char) == 226 && ord($nextchar) == 136 && ord($nextnextchar) == 153)
- return 3;
- //·
- if (ord($char) == 194 && ord($nextchar) == 183)
- return 2;
- return (strpos(',?()[]{}"!:|;' . TP_GTXT_BRK . TP_GTXT_BRK_CLOSER . TP_GTXT_IBRK . TP_GTXT_IBRK_CLOSER, $char) !== false) ? 1 : 0; // TODO: might need to add < and > here
- }
- /**
- * Determines if the current position marks the begining of a number, e.g. 123 050-391212232
- * @return int length of number.
- */
- function is_number($page, $position) {
- return strspn($page, '0123456789-+$%#*,.\\/', $position);
- }
- /**
- * Create a phrase tag in the html dom tree
- * @param int $start - beginning of phrase in element
- * @param int $end - end of phrase in element
- */
- function tag_phrase($string, $start, $end) {
- $phrase = trim(substr($string, $start, $end - $start));
- $phrasefixed = trim(str_replace(' ', ' ', $phrase));
- // $logstr = str_replace(array(chr(1),chr(2),chr(3),chr(4)), array('[1]','[2]','[3]','[4]'), $string);
- // tp_logger ("p:$phrasefixed, s:$logstr, st:$start, en:$end, gt:{$this->in_get_text}, gti:{$this->in_get_text_inner}");
- if ($this->in_get_text > $this->in_get_text_inner) {
- tp_logger('not tagging ' . $phrase . ' assumed gettext translated', 4);
- return;
- }
- if ($phrase) {
- tp_logger('tagged phrase: ' . $phrase, 4);
- $node = new simple_html_dom_node($this->html);
- $node->tag = 'phrase';
- $node->parent = $this->currentnode;
- $this->currentnode->nodes[] = $node;
- $node->_[HDOM_INFO_OUTER] = '';
- $node->phrase = $phrasefixed;
- $this->prefetch_phrases[$phrasefixed] = true;
- $node->start = $start;
- $node->len = strlen($phrase);
- if ($this->srclang)
- $node->srclang = $this->srclang;
- if ($this->inbody)
- $node->inbody = $this->inbody;
- if ($this->inselect)
- $node->inselect = true;
- }
- }
- /**
- * Breaks strings into substring according to some rules and common sense
- * @param string $string - the string which is "broken" into smaller strings
- */
- function parsetext($string) {
- $pos = 0;
- // $pos = skip_white_space($string, $pos);
- // skip CDATA in feed_fix mode
- if ($this->feed_fix) {
- if (strpos($string, '<![CDATA[') === 0) {
- $pos = 9; // CDATA length
- $string = substr($string, 0, -3); // chop the last ]]>;
- }
- }
- $start = $pos;
- while ($pos < strlen($string)) {
- // Some HTML entities make us break, almost all but apostrophies
- if ($this->ent_breaks && $len_of_entity = $this->is_html_entity($string, $pos)) {
- $entity = substr($string, $pos, $len_of_entity);
- if (($this->is_white_space(@$string[$pos + $len_of_entity]) || $this->is_entity_breaker($entity)) && !$this->is_entity_letter($entity)) {
- tp_logger("entity ($entity) breaks", 4);
- $this->tag_phrase($string, $start, $pos);
- $start = $pos + $len_of_entity;
- }
- // skip nbsp starting a phrase
- tp_logger("entity ($entity)", 4);
- if ($entity === ' ' && $start === $pos) {
- $start = $pos + $len_of_entity;
- }
- //skip past entity
- $pos += $len_of_entity;
- }
- // we have a special case for <> tags which might have came to us (maybe in xml feeds) (we'll skip them...)
- elseif ($string[$pos] == '<') {
- $this->tag_phrase($string, $start, $pos);
- while ($string[$pos] != '>' && $pos < strlen($string))
- $pos++;
- $pos++;
- $start = $pos;
- } elseif ($string[$pos] == TP_GTXT_BRK || $string[$pos] == TP_GTXT_BRK_CLOSER) {
- // $logstr = str_replace(array(chr(1),chr(2),chr(3),chr(4)), array('[1]','[2]','[3]','[4]'), $string);
- // $closers = ($string[$pos] == TP_GTXT_BRK) ? '': 'closer';
- // tp_logger(" $closers TEXT breaker $logstr start:$start pos:$pos gt:" . $this->in_get_text, 3);
- $this->tag_phrase($string, $start, $pos);
- ($string[$pos] == TP_GTXT_BRK) ? $this->in_get_text += 1 : $this->in_get_text -= 1;
- $pos++;
- $start = $pos;
- // reset state based on string start, no need to flip
- //$this->in_get_text = ($pos == 1);
- //if (!$this->in_get_text) $this->in_get_text_inner = false;
- } elseif ($string[$pos] == TP_GTXT_IBRK || $string[$pos] == TP_GTXT_IBRK_CLOSER) {
- // $logstr = str_replace(array(chr(1),chr(2),chr(3),chr(4)), array('[1]','[2]','[3]','[4]'), $string);
- // $closers = ($string[$pos] == TP_GTXT_IBRK) ? '': 'closer';
- // tp_logger(" $closers INNER text breaker $logstr start:$start pos:$pos gt:" . $this->in_get_text_inner, 3);
- //tp_logger("inner text breaker $start $pos $string " . (($this->in_get_text_inner) ? 'true' : 'false'), 5);
- $this->tag_phrase($string, $start, $pos);
- if ($this->in_get_text)
- ($string[$pos] == TP_GTXT_IBRK) ? $this->in_get_text_inner += 1 : $this->in_get_text_inner -= 1;
- $pos++;
- $start = $pos;
- //$this->in_get_text_inner = !$this->in_get_text_inner;
- }
- // will break translation unit when there's a breaker ",.[]()..."
- elseif ($this->punct_breaks && $senb_len = $this->is_sentence_breaker($string[$pos], @$string[$pos + 1], @$string[$pos + 2])) {
- // logger ("sentence breaker...");
- $this->tag_phrase($string, $start, $pos);
- $pos += $senb_len;
- $start = $pos;
- }
- // Numbers also break, if they are followed by whitespace (or a sentence breaker) (don't break 42nd) // TODO: probably by breaking entities too...
- // also prefixed by whitespace?
- elseif ($this->num_breaks && $num_len = $this->is_number($string, $pos)) {
- // logger ("numnum... $num_len");
- // this is the case of B2 or B2,
- if (($start == $pos) || ($this->is_white_space($string[$pos - 1]) || ($this->is_sentence_breaker(@$string[$pos + $num_len - 1], @$string[$pos + $num_len], @$string[$pos + $num_len + 1]))) &&
- ($this->is_white_space(@$string[$pos + $num_len]) || $this->is_sentence_breaker(@$string[$pos + $num_len], @$string[$pos + $num_len + 1], @$string[$pos + $num_len + 2]))) {
- // we will now compensate on the number followed by breaker case, if we need to
- // logger ("compensate part1?");
- if (!(($start == $pos) || $this->is_white_space($string[$pos - 1]))) {
- // logger ("compensate part2?");
- if ($this->is_sentence_breaker($string[$pos + $num_len - 1], @$string[$pos + $num_len], @$string[$pos + $num_len + 1])) {
- // logger ("compensate 3?");
- $num_len--; //this makes the added number shorter by one, and the pos will be at a sentence breaker next so we don't have to compensate
- }
- $pos += $num_len;
- $num_len = 0; // we have already added this
- }
- $this->tag_phrase($string, $start, $pos);
- $start = $pos + $num_len /* +1 */;
- }
- $pos += $num_len/* + 1 */;
- // logger ("numnumpos... $pos");
- } else {
- // smarter marking of start location
- if ($start == $pos && $this->is_white_space($string[$pos]))
- $start++;
- $pos++;
- }
- }
- // the end is also some breaker
- if ($pos > $start) {
- $this->tag_phrase($string, $start, $pos);
- }
- }
- /**
- * This recursive function works on the $html dom and adds phrase nodes to translate as needed
- * it currently also rewrites urls, and should consider if this is smart
- * @param simple_html_dom_node $node
- */
- function translate_tagging($node, $level = 0) {
- $this->currentnode = $node;
- // we don't want to translate non-translatable classes
- if (stripos($node->class, NO_TRANSLATE_CLASS) !== false || stripos($node->class, NO_TRANSLATE_CLASS_GOOGLE) !== false)
- return;
- // the node lang is the current node lang or its parent lang
- if ($node->lang) {
- // allow nesting of srclang (again - local var)
- $prevsrclang = $this->srclang;
- $this->srclang = strtolower($node->lang);
- // using a local variable scope for later
- $src_set_here = true;
- // eliminate the lang tag from the html, since we aim to translate it
- unset($node->lang);
- }
- // we can only do translation for elements which are in the body, not in other places, and this must
- // move here due to the possibility of early recurse in default language
- if ($node->tag == 'body') {
- $this->inbody = true;
- }
- // this again should be here, the different behaviour on select and textarea
- // for now - we assume that they can't include each other
- elseif ($node->tag == 'select' || $node->tag == 'textarea' || $node->tag == 'noscript') {
- $this->inselect = true;
- $inselect_set_here = true;
- }
- //support only_thislanguage class, (nulling the node if it should not display)
- if (isset($src_set_here) && $src_set_here && $this->srclang != $this->lang && stripos($node->class, ONLY_THISLANGUAGE_CLASS) !== false) {
- $this->srclang = $prevsrclang; //we should return to the previous src lang or it will be kept and carried
- $node->outertext = '';
- return;
- }
- // if we are in the default lang, and we have no foreign langs classes, we'll recurse from here
- // we also avoid processing if the node lang is the target lang
- if (($this->default_lang && !$this->srclang) || ($this->srclang === $this->lang)) {
- foreach ($node->nodes as $c) {
- $this->translate_tagging($c, $level + 1);
- }
- if (isset($src_set_here) && $src_set_here)
- $this->srclang = $prevsrclang;
- if (isset($inselect_set_here) && $inselect_set_here)
- $this->inselect = false;
- return;
- }
- if (isset($this->ignore_tags[$node->tag]))
- return;
- if ($node->tag == 'text') {
- // this prevents translation of a link that just surrounds its address
- if ($node->parent->tag == 'a' && $node->parent->href == $node->outertext) {
- return;
- }
- // link tags inners are to be ignored
- if ($node->parent->tag == 'link') {
- return;
- }
- if (trim($node->outertext)) {
- $this->parsetext($node->outertext);
- }
- }
- // for anchors we will rewrite urls if we can
- elseif ($node->tag == 'a') {
- array_push($this->atags, $node);
- }
- // same for options, although normally not required (ticket #34)
- elseif ($node->tag == 'option') {
- array_push($this->otags, $node);
- }
- // in submit type inputs, we want to translate the value
- elseif ($node->tag == 'input' && $node->type == 'submit') {
- $this->parsetext($node->value);
- }
- // for iframes we will rewrite urls if we can
- elseif ($node->tag == 'iframe') {
- if ($this->url_rewrite_func) {
- $node->src = call_user_func_array($this->url_rewrite_func, array($node->src));
- tp_logger('iframe: ' . $node->src, 4);
- }
- }
- // titles and placeholders are also good places to translate, exist in a, img, abbr, acronym
- if ($node->title) {
- $this->parsetext($node->title);
- }
- if ($node->placeholder) {
- $this->parsetext($node->placeholder);
- }
- if ($node->alt) {
- $this->parsetext($node->alt);
- }
- // Meta content (keywords, description) are also good places to translate (but not in robots... or http-equiv)
- if ($node->tag == 'meta' && $node->content && ($node->name != 'robots') && ($node->name != 'viewport') && ($node->{'http-equiv'} != 'Content-Type'))
- $this->parsetext($node->content);
- // recurse
- foreach ($node->nodes as $c) {
- $this->translate_tagging($c, $level + 1);
- }
- if (isset($src_set_here) && $src_set_here)
- $this->srclang = $prevsrclang;
- if (isset($inselect_set_here) && $inselect_set_here)
- $this->inselect = false;
- }
- /**
- * Creates a span used in translation and editing
- * @param string $original_text
- * @param string $translated_text
- * @param int $source (Either "0" for Human, "1" for Machine or "" for untouched)
- * @param boolean $for_hidden_element
- * @param string $src_lang - if source lang of element is different that default (eg. wrapped in lang="xx" attr)
- * @return string
- */
- function create_edit_span($original_text, $translated_text, $source, $for_hidden_element = false, $src_lang = '') {
- // Use base64 encoding to make that when the page is translated (i.e. update_translation) we
- // get back exactlly the same string without having the client decode/encode it in anyway.
- $this->edit_span_created = true;
- $span = '<span class ="' . SPAN_PREFIX . '" id="' . SPAN_PREFIX . $this->span_id . '" data-source="' . $source . '"';
- //$span = '<span class ="' . SPAN_PREFIX . '" id="' . SPAN_PREFIX . $this->span_id . '" data-token="' . transposh_utils::base64_url_encode($original_text) . '" data-source="' . $source . '"';
- // if we have a source language
- if ($src_lang) {
- $span .= ' data-srclang="' . $src_lang . '"';
- }
- // since orig replaces token too
- $span .= ' data-orig="' . $original_text . '"';
- // those are needed for hidden elements translations
- if ($for_hidden_element) {
- $span .= ' data-hidden="y"';
- // hidden elements currently have issues figuring what they translated in the JS
- if ($translated_text != null) {
- $span .= ' data-trans="' . $translated_text . '"';
- }
- }
- $span .= '>';
- if (!$for_hidden_element) {
- if ($translated_text)
- $span .= $translated_text;
- else
- $span .= $original_text;
- }
- $span .= '</span>';
- ++$this->span_id;
- return $span;
- }
- /**
- * This function does some ad replacement for transposh benefit
- */
- function do_ad_switch() {
- if (isset($this->html->noise) && is_array($this->html->noise)) {
- foreach ($this->html->noise as $key => $value) {
- if (strpos($value, 'google_ad_client') !== false) {
- $publoc = strpos($value, 'pub-');
- $sufloc = strpos($value, '"', $publoc);
- if (!$sufloc)
- $sufloc = strpos($value, "'", $publoc);
- echo $publoc . ' ' . $sufloc;
- if ($publoc && $sufloc)
- $this->html->noise[$key] = substr($value, 0, $publoc) . 'pub-7523823497771676' . substr($value, $sufloc);
- }
- }
- }
- // INS TAGS
- foreach ($this->html->find('ins') as $e) {
- $e->{'data-ad-client'} = 'ca-pub-7523823497771676';
- }
- }
- /**
- * Allow changing of parsing rules, yeah, I caved
- * @param type $puncts
- * @param type $numbers
- * @param type $entities
- */
- function change_parsing_rules($puncts, $numbers, $entities) {
- $this->punct_breaks = $puncts;
- $this->num_breaks = $numbers;
- $this->ent_breaks = $entities;
- }
- /**
- * Main function - actually translates a given HTML
- * @param string $string containing HTML
- * @return string Translated content is here
- */
- function fix_html($string) {
- // ready our stats
- $this->stats = new tp_parserstats();
- // handler for possible json (buddypress)
- if ($this->might_json) {
- if ($string[0] == '{') {
- $jsoner = json_decode($string);
- if ($jsoner != null) {
- tp_logger("json detected (buddypress?)", 4);
- // currently we only handle contents (which buddypress heavily use)
- if ($jsoner->contents) {
- $jsoner->contents = $this->fix_html($jsoner->contents);
- }
- if ($jsoner->fragments->{'div.widget_shopping_cart_content'}) {
- $jsoner->fragments->{'div.widget_shopping_cart_content'} = $this->fix_html($jsoner->fragments->{'div.widget_shopping_cart_content'});
- }
- if ($jsoner->fragments->{'div.kt-header-mini-cart-refreash'}) {
- $jsoner->fragments->{'div.kt-header-mini-cart-refreash'} = $this->fix_html($jsoner->fragments->{'div.kt-header-mini-cart-refreash'});
- }
- if ($jsoner->fragments->{'a.cart-contents'}) {
- $jsoner->fragments->{'a.cart-contents'} = $this->fix_html($jsoner->fragments->{'a.cart-contents'});
- }
- if ($jsoner->fragments->{'.woocommerce-checkout-review-order-table'}) {
- $jsoner->fragments->{'.woocommerce-checkout-review-order-table'} = $this->fix_html($jsoner->fragments->{'.woocommerce-checkout-review-order-table'});
- }
- if ($jsoner->fragments->{'.woocommerce-checkout-payment'}) {
- $jsoner->fragments->{'.woocommerce-checkout-payment'} = $this->fix_html($jsoner->fragments->{'.woocommerce-checkout-payment'});
- }
- return json_encode($jsoner); // now any attempted json will actually return a json
- }
- }
- }
- // create our dom
- $string = str_replace(chr(0xC2) . chr(0xA0), ' ', $string); // annoying NBSPs?
- $this->html = str_get_html($string, false); // false for RSS?
- //$this->stats->do_timing();
- //Log::info("Stats Build dom:" . $this->stats->time);
- // mark translateable elements
- if ($this->html->find('html', 0))
- $this->html->find('html', 0)->lang = ''; // Document defined lang may be preset to correct lang, but should be ignored TODO: Better?
- $this->translate_tagging($this->html->root);
- //$this->stats->do_timing();
- //Log::info("Stats Done tagging:" . $this->stats->time);
- // first fix the html tag itself - we might need to to the same for all such attributes with flipping
- if ($this->html->find('html', 0)) {
- if ($this->dir_rtl)
- $this->html->find('html', 0)->dir = 'rtl';
- else
- $this->html->find('html', 0)->dir = 'ltr';
- }
- if ($this->lang) {
- if ($this->html->find('html', 0))
- $this->html->find('html', 0)->lang = $this->lang;
- // add support for <meta name="language" content="<lang>">
- if ($this->html->find('meta[name=language]')) {
- @$this->html->find('meta[name=language]')->content = $this->lang;
- }
- }
- // not much point in further processing if we don't have a function that does it
- if ($this->fetch_translate_func == null) {
- return $this->html;
- }
- // fix feed
- if ($this->feed_fix) {
- // fix urls on feed
- tp_logger('fixing rss feed', 3);
- foreach (array('link', 'wfw:commentrss', 'comments') as $tag) {
- foreach ($this->html->find($tag) as $e) {
- $e->innertext = htmlspecialchars(call_user_func_array($this->url_rewrite_func, array($e->innertext)));
- // no need to translate anything here
- unset($e->nodes);
- }
- }
- // guid is not really a url -- in some future, we can check if permalink is true and probably falsify it
- foreach ($this->html->find('guid') as $e) {
- $e->innertext = $e->innertext . '-' . $this->lang;
- unset($e->nodes);
- }
- // fix feed language
- @$this->html->find('language', 0)->innertext = $this->lang;
- unset($this->html->find('language', 0)->nodes);
- } else {
- // since this is not a feed, we might have references to such in the <link rel="alternate">
- foreach ($this->html->find('link') as $e) {
- if (strcasecmp($e->rel, 'alternate') == 0 || strcasecmp($e->rel, 'canonical') == 0) {
- if (!$e->hreflang)
- $e->href = call_user_func_array($this->url_rewrite_func, array($e->href));
- }
- }
- }
- // try some prefetching... (//todo - maybe move directly to the phrase create)
- // $originals = array();
- if ($this->prefetch_translate_func != null) {
- /* foreach ($this->html->find('text') as $e) {
- foreach ($e->nodes as $ep) {
- if ($ep->phrase) $originals[$ep->phrase] = true;
- }
- }
- foreach (array('title', 'value', 'placeholder', 'alt') as $title) {
- foreach ($this->html->find('[' . $title . ']') as $e) {
- if (isset($e->nodes))
- foreach ($e->nodes as $ep) {
- if ($ep->phrase) $originals[$ep->phrase] = true;
- }
- }
- }
- foreach ($this->html->find('[content]') as $e) {
- foreach ($e->nodes as $ep) {
- if ($ep->phrase) $originals[$ep->phrase] = true;
- }
- } */
- // if we should split, we will split some urls for translation prefetching
- if ($this->split_url_func != null) {
- foreach ($this->atags as $e) {
- foreach (call_user_func_array($this->split_url_func, array($e->href)) as $part) {
- $this->prefetch_phrases[$part] = true;
- }
- }
- foreach ($this->otags as $e) {
- foreach (call_user_func_array($this->split_url_func, array($e->value)) as $part) {
- $this->prefetch_phrases[$part] = true;
- }
- }
- }
- call_user_func_array($this->prefetch_translate_func, array($this->prefetch_phrases, $this->lang));
- }
- //fix urls more
- // WORK IN PROGRESS
- /* foreach ($this->atags as $e) {
- $hrefspans = '';
- foreach (call_user_func_array($this->split_url_func, array($e->href)) as $part) {
- // fix - not for dashes
- list ($source, $translated_text) = call_user_func_array($this->fetch_translate_func, array($part, $this->lang));
- $hrefspans .= $this->create_edit_span($part, $translated_text, $source, true);
- }
- $e->href = call_user_func_array($this->url_rewrite_func, array($e->href));
- $e->outertext .= $hrefspans;
- } */
- // fix src for items
- if ($this->fix_src_tag_func !== null) {
- foreach ($this->html->find('[src]') as $e) {
- $e->src = call_user_func_array($this->fix_src_tag_func, array($e->src));
- }
- foreach ($this->html->find('link') as $e) {
- $e->href = call_user_func_array($this->fix_src_tag_func, array($e->href));
- }
- }
- // fix urls...
- foreach ($this->atags as $e) {
- if ($e->href)
- $e->href = call_user_func_array($this->url_rewrite_func, array($e->href));
- }
- foreach ($this->otags as $e) {
- if ($e->value)
- $e->value = call_user_func_array($this->url_rewrite_func, array($e->value));
- }
- // this is used to reserve spans we cannot add directly (out of body, metas, etc)
- $hiddenspans = '';
- $savedspan = '';
- // actually translate tags
- // texts are first
- foreach ($this->html->find('text') as $e) {
- $replace = array();
- foreach ($e->nodes as $ep) {
- list ($source, $translated_text) = call_user_func_array($this->fetch_translate_func, array($ep->phrase, $this->lang));
- //stats
- $this->stats->total_phrases++;
- if ($translated_text) {
- $this->stats->translated_phrases++;
- if ($source == 0)
- $this->stats->human_translated_phrases++;
- }
- if (($this->is_edit_mode || ($this->is_auto_translate && $translated_text == null))/* && $ep->inbody */) {
- if ($ep->inselect) {
- $savedspan .= $this->create_edit_span($ep->phrase, $translated_text, $source, true, $ep->srclang);
- } elseif (!$ep->inbody) {
- $hiddenspans .= $this->create_edit_span($ep->phrase, $translated_text, $source, true, $ep->srclang);
- } else {
- $translated_text = $this->create_edit_span($ep->phrase, $translated_text, $source, false, $ep->srclang);
- }
- }
- // store replacements
- if ($translated_text) {
- $replace[] = array($translated_text, $ep);
- }
- }
- // do replacements in reverse
- foreach (array_reverse($replace) as $epag) {
- list($replacetext, $epg) = $epag;
- $e->outertext = substr_replace($e->outertext, $replacetext, $epg->start, $epg->len);
- }
- // this adds saved spans to the first not in select element which is in the body
- if ($e->nodes && !$ep->inselect && $savedspan && $ep->inbody) { // (TODO: might not be...?)
- $e->outertext = $savedspan . $e->outertext;
- $savedspan = '';
- }
- }
- // now we handle the title attributes (and the value of submit buttons)
- $hidden_phrases = array();
- foreach (array('title', 'value', 'placeholder', 'alt') as $title) {
- foreach ($this->html->find('[' . $title . ']') as $e) {
- $replace = array();
- $span = '';
- // when we already have a parent outertext we'll have to update it directly
- if (isset($e->parent->_[HDOM_INFO_OUTER])) {
- $saved_outertext = $e->outertext;
- }
- tp_logger("$title-original: $e->$title}", 4);
- if (isset($e->nodes))
- foreach ($e->nodes as $ep) {
- if ($ep->tag == 'phrase') {
- list ($source, $translated_text) = call_user_func_array($this->fetch_translate_func, array($ep->phrase, $this->lang));
- // more stats
- $this->stats->total_phrases++;
- if ($ep->inbody)
- $this->stats->hidden_phrases++;
- else
- $this->stats->meta_phrases++;
- if ($translated_text) {
- $this->stats->translated_phrases++;
- if ($ep->inbody)
- $this->stats->hidden_translated_phrases++;
- else
- $this->stats->meta_translated_phrases++;
- if ($source == 0)
- $this->stats->human_translated_phrases++;
- }
- if (($this->is_edit_mode || ($this->is_auto_translate && $translated_text == null)) && $ep->inbody) {
- // prevent duplicate translation (title = text)
- if (strpos($e->innertext, $ep->phrase /* Transposh_utils::base64_url_encode($ep->phrase) */) === false) {
- // if (strpos($e->innertext, transposh_utils::base64_url_encode($ep->phrase)) === false) {
- //no need to translate span the same hidden phrase more than once
- if (!in_array($ep->phrase, $hidden_phrases)) {
- $this->stats->hidden_translateable_phrases++;
- $span .= $this->create_edit_span($ep->phrase, $translated_text, $source, true, $ep->srclang);
- // logger ($span);
- $hidden_phrases[] = $ep->phrase;
- }
- }
- }
- // if we need to replace, we store this
- if ($translated_text) {
- $replace[$translated_text] = $ep;
- }
- }
- }
- // and later replace
- foreach (array_reverse($replace, true) as $replace => $epg) {
- $e->$title = substr_replace($e->$title, $replace, $epg->start, $epg->len);
- }
- $e->outertext .= $span;
- // this is where we update in the outercase issue
- if (isset($e->parent->_[HDOM_INFO_OUTER])) {
- $e->parent->outertext = implode($e->outertext, explode($saved_outertext, $e->parent->outertext, 2));
- }
- }
- }
- // now we handle the meta content - which is simpler because they can't be edited or auto-translated in place
- // we also don't expect any father modifications here
- // so we now add all those spans right before the <body> tag end
- foreach ($this->html->find('[content]') as $e) {
- $right = '';
- $newtext = '';
- foreach ($e->nodes as $ep) {
- if ($ep->tag == 'phrase') {
- // even more stats
- $this->stats->total_phrases++;
- $this->stats->meta_phrases++;
- list ($source, $translated_text) = call_user_func_array($this->fetch_translate_func, array($ep->phrase, $this->lang));
- if ($translated_text) {
- $this->stats->translated_phrases++;
- $this->stats->meta_translated_phrases++;
- if ($source == 0)
- $this->stats->human_translated_phrases++;
- list ($left, $right) = explode($ep->phrase, $e->content, 2);
- $newtext .= $left . $translated_text;
- $e->content = $right;
- }
- if ($this->is_edit_mode) {
- $hiddenspans .= $this->create_edit_span($ep->phrase, $translated_text, $source, true, $ep->srclang);
- }
- if (!$translated_text && $this->is_auto_translate && !$this->is_edit_mode) {
- tp_logger('untranslated meta for ' . $ep->phrase . ' ' . $this->lang);
- if ($this->is_edit_mode || $this->is_auto_translate) { // FIX
- }
- }
- }
- }
- if ($newtext) {
- $e->content = $newtext . $right;
- tp_logger("content-phrase: $newtext", 4);
- }
- }
- if ($hiddenspans) {
- $body = $this->html->find('body', 0);
- if ($body != null)
- $body->lastChild()->outertext .= $hiddenspans;
- }
- // we might show an ad for transposh in some cases
- if (($this->allow_ad && !$this->default_lang && mt_rand(1, 100) > 95) || // 5 of 100 for translated non default language pages
- ($this->allow_ad && $this->default_lang && mt_rand(1, 100) > 99) || // 1 of 100 for translated default languages pages
- (!$this->allow_ad && mt_rand(1, 1000) > 999)) { // 1 of 1000 otherwise
- $this->do_ad_switch();
- }
- // This adds a meta tag with our statistics json-encoded inside...
- // $this->stats->do_timing();
- // Log::info("Stats Done:" . $this->stats->time);
- $head = $this->html->find('head', 0);
- if ($this->edit_span_created) {
- if ($head != null) {
- $head->lastChild()->outertext .= $this->added_header;
- }
- }
- //exit;
- if ($head != null)
- $head->lastChild()->outertext .= "\n<meta name=\"translation-stats\" content='" . json_encode($this->stats) . "'/>";
- // we make sure that the result is clear from our shananigans
- return str_replace(array(TP_GTXT_BRK, TP_GTXT_IBRK, TP_GTXT_BRK_CLOSER, TP_GTXT_IBRK_CLOSER), '', $this->html->outertext);
- // Changed because of places where tostring failed
- //return $this->html;
- //return $this->html->outertext;
- }
- /**
- * This functions returns a list of phrases from a given HTML string
- * @param string $string Html with phrases to extract
- * @return array List of phrases (or an empty one)
- * @since 0.3.5
- */
- function get_phrases_list($string) {
- $result = array();
- // create our dom
- $this->html = str_get_html('<span lang="xx">' . $string . '</span>');
- // mark translateable elements
- $this->translate_tagging($this->html->root);
- foreach ($this->html->nodes as $ep) {
- if ($ep->tag == 'phrase') {
- $result[$ep->phrase] = $ep->phrase;
- }
- }
- return $result;
- }
- }
- ?>
Add Comment
Please, Sign In to add comment