Advertisement
jatinluthra14

Servlet.py

Jan 8th, 2016
17
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 34.04 KB | None | 0 0
  1. #!/usr/bin/env python3
  2. # -*- indent-tabs-mode: nil -*-
  3. # coding=utf-8
  4. # -*- encoding: utf-8 -*-
  5.  
  6. import sys, os, re, ssl, argparse, logging, time, signal, tempfile, zipfile
  7. from subprocess import Popen, PIPE
  8. from multiprocessing import Pool, TimeoutError
  9. from functools import wraps
  10. from threading import Thread
  11. from datetime import datetime
  12. import datetime
  13. import heapq
  14.  
  15. import tornado, tornado.web, tornado.httpserver, tornado.process, tornado.iostream
  16. from tornado import escape, gen
  17. from tornado.escape import utf8
  18. try: #3.1
  19. from tornado.log import enable_pretty_logging
  20. except ImportError: #2.1
  21. from tornado.options import enable_pretty_logging
  22.  
  23. from modeSearch import searchPath
  24. from util import getLocalizedLanguages, apertium, bilingualTranslate, stripTags, processPerWord, getCoverage, getCoverages, toAlpha3Code, toAlpha2Code, noteUnknownToken, scaleMtLog, TranslationInfo, closeDb, flushUnknownWords, inMemoryUnknownToken
  25. import translation
  26. import util
  27. from keys import getKey
  28. global nowords
  29. global tnowords
  30. global startwordtime
  31. global endwordtime
  32. global wordtime
  33. global twordtime
  34. nowords = 0
  35. tnowords = 0
  36. startwordtime = 0
  37. endwordtime = 0
  38. wordtime = 0
  39. twordtime = 0
  40.  
  41. try:
  42. import cld2full as cld2
  43. except:
  44. cld2 = None
  45.  
  46. def run_async_thread(func):
  47. @wraps(func)
  48. def async_func(*args, **kwargs):
  49. func_hl = Thread(target = func, args = args, kwargs = kwargs)
  50. func_hl.start()
  51. return func_hl
  52.  
  53. return async_func
  54.  
  55. def sig_handler(sig, frame):
  56. global tnowords
  57. global wordtime
  58. global missingFreqsDb
  59. if missingFreqsDb:
  60. if 'children' in frame.f_locals:
  61. for child in frame.f_locals['children']:
  62. os.kill(child, signal.SIGTERM)
  63. flushUnknownWords(missingFreqsDb)
  64. else: # we are one of the children
  65. flushUnknownWords(missingFreqsDb)
  66. logging.warning('Caught signal: %s', sig)
  67. logging.warning('Total Words: %s', tnowords)
  68. logging.info('Total Words: %d', wordtime)
  69. closeDb()
  70. exit()
  71.  
  72. class BaseHandler(tornado.web.RequestHandler):
  73. pairs = {}
  74. analyzers = {}
  75. generators = {}
  76. taggers = {}
  77. pipelines = {} # (l1, l2): [translation.Pipeline], only contains flushing pairs!
  78. pipelines_holding = []
  79. callback = None
  80. timeout = None
  81. scaleMtLogs = False
  82. inMemoryUnknown = False
  83. inMemoryLimit = -1
  84. verbosity = 0
  85.  
  86. stats = {
  87. 'useCount': {},
  88. 'vmsize': 0,
  89. }
  90.  
  91. pipeline_cmds = {} # (l1, l2): translation.ParsedModes
  92. max_pipes_per_pair = 1
  93. min_pipes_per_pair = 0
  94. max_users_per_pipe = 5
  95. max_idle_secs = 0
  96. restart_pipe_after = 1000
  97.  
  98. def initialize(self):
  99. self.callback = self.get_argument('callback', default=None)
  100.  
  101. def log_vmsize(self):
  102. if self.verbosity < 1:
  103. return
  104. scale = {'kB': 1024, 'mB': 1048576,
  105. 'KB': 1024, 'MB': 1048576}
  106. try:
  107. for line in open('/proc/%d/status' % os.getpid()):
  108. if line.startswith('VmSize:'):
  109. _, num, unit = line.split()
  110. break
  111. vmsize = int(num) * scale[unit]
  112. if vmsize > self.stats['vmsize']:
  113. logging.warning("VmSize of %s from %d to %d" % (os.getpid(), self.stats['vmsize'], vmsize))
  114. self.stats['vmsize'] = vmsize
  115. except:
  116. # don't let a stupid logging function mess us up
  117. pass
  118.  
  119.  
  120. def sendResponse(self, data):
  121. self.log_vmsize()
  122. if isinstance(data, dict) or isinstance(data, list):
  123. data = escape.json_encode(data)
  124. self.set_header('Content-Type', 'application/json; charset=UTF-8')
  125.  
  126. if self.callback:
  127. self.set_header('Content-Type', 'application/javascript; charset=UTF-8')
  128. self._write_buffer.append(utf8('%s(%s)' % (self.callback, data)))
  129. else:
  130. self._write_buffer.append(utf8(data))
  131. self.finish()
  132.  
  133. def write_error(self, status_code, **kwargs):
  134. # TODO: Is there a tornado fn to get the full list?
  135. http_messages = {
  136. 400: 'Bad Request',
  137. 404: 'Not Found',
  138. 408: 'Request Timeout',
  139. 500: 'Internal Error'
  140. }
  141.  
  142. result = {
  143. 'status': 'error',
  144. 'code': status_code,
  145. 'message': http_messages.get(status_code, ''),
  146. 'explanation': kwargs.get('explanation', '')
  147. }
  148.  
  149. data = escape.json_encode(result)
  150. self.set_header('Content-Type', 'application/json; charset=UTF-8')
  151.  
  152. if self.callback:
  153. self.set_header('Content-Type', 'application/javascript; charset=UTF-8')
  154. self._write_buffer.append(utf8('%s(%s)' % (self.callback, data)))
  155. else:
  156. self._write_buffer.append(utf8(data))
  157. self.finish()
  158.  
  159. def set_default_headers(self):
  160. self.set_header('Access-Control-Allow-Origin', '*')
  161. self.set_header('Access-Control-Allow-Methods', 'GET,POST,OPTIONS')
  162. self.set_header('Access-Control-Allow-Headers', 'accept, cache-control, origin, x-requested-with, x-file-name, content-type')
  163.  
  164. @tornado.web.asynchronous
  165. def post(self):
  166. self.get()
  167.  
  168. def options(self):
  169. self.set_status(204)
  170. self.finish()
  171.  
  172. class ListHandler(BaseHandler):
  173. @tornado.web.asynchronous
  174. def get(self):
  175. query = self.get_argument('q', default='pairs')
  176.  
  177. if query == 'pairs':
  178. responseData = []
  179. for pair in self.pairs:
  180. (l1, l2) = pair.split('-')
  181. responseData.append({'sourceLanguage': l1, 'targetLanguage': l2})
  182. if self.get_arguments('include_deprecated_codes'):
  183. responseData.append({'sourceLanguage': toAlpha2Code(l1), 'targetLanguage': toAlpha2Code(l2)})
  184. self.sendResponse({'responseData': responseData, 'responseDetails': None, 'responseStatus': 200})
  185. elif query == 'analyzers' or query == 'analysers':
  186. self.sendResponse({pair: modename for (pair, (path, modename)) in self.analyzers.items()})
  187. elif query == 'generators':
  188. self.sendResponse({pair: modename for (pair, (path, modename)) in self.generators.items()})
  189. elif query == 'taggers' or query == 'disambiguators':
  190. self.sendResponse({pair: modename for (pair, (path, modename)) in self.taggers.items()})
  191. else:
  192. self.send_error(400, explanation='Expecting q argument to be one of analysers, generators, disambiguators or pairs')
  193.  
  194. class StatsHandler(BaseHandler):
  195. @tornado.web.asynchronous
  196. def get(self):
  197. self.sendResponse({
  198. 'responseData': {
  199. 'useCount': { '%s-%s' % pair: useCount
  200. for pair, useCount in self.stats['useCount'].items() },
  201. 'runningPipes': { '%s-%s' % pair: len(pipes)
  202. for pair, pipes in self.pipelines.items()
  203. if pipes != [] },
  204. 'holdingPipes': len(self.pipelines_holding),
  205. },
  206. 'responseDetails': None,
  207. 'responseStatus': 200
  208. })
  209.  
  210. class RootHandler(BaseHandler):
  211. @tornado.web.asynchronous
  212. def get(self):
  213. self.redirect("http://wiki.apertium.org/wiki/Apertium-apy")
  214.  
  215. class TranslateHandler(BaseHandler):
  216. def notePairUsage(self, pair):
  217. self.stats['useCount'][pair] = 1 + self.stats['useCount'].get(pair, 0)
  218.  
  219. unknownMarkRE = re.compile(r'\*([^.,;:\t\* ]+)')
  220. def maybeStripMarks(self, markUnknown, l1, l2, translated):
  221. self.noteUnknownTokens("%s-%s" % (l1, l2), translated)
  222. if markUnknown:
  223. return translated
  224. else:
  225. return re.sub(self.unknownMarkRE, r'\1', translated)
  226.  
  227. def noteUnknownTokens(self, pair, text):
  228. if self.missingFreqs:
  229. for token in re.findall(self.unknownMarkRE, text):
  230. if self.inMemoryUnknown:
  231. inMemoryUnknownToken(token, pair, self.missingFreqs, self.inMemoryLimit)
  232. else:
  233. noteUnknownToken(token, pair, self.missingFreqs)
  234.  
  235. def cleanable(self, i, pair, pipe):
  236. if pipe.useCount > self.restart_pipe_after:
  237. # Not affected by min_pipes_per_pair
  238. logging.info('A pipe for pair %s-%s has handled %d requests, scheduling restart',
  239. pair[0], pair[1], self.restart_pipe_after)
  240. return True
  241. elif (i >= self.min_pipes_per_pair
  242. and self.max_idle_secs != 0
  243. and time.time() - pipe.lastUsage > self.max_idle_secs):
  244. logging.info("A pipe for pair %s-%s hasn't been used in %d secs, scheduling shutdown",
  245. pair[0], pair[1], self.max_idle_secs)
  246. return True
  247. else:
  248. return False
  249.  
  250. def cleanPairs(self):
  251. for pair in self.pipelines:
  252. pipes = self.pipelines[pair]
  253. to_clean = set(p for i, p in enumerate(pipes)
  254. if self.cleanable(i, pair, p))
  255. self.pipelines_holding += to_clean
  256. pipes[:] = [p for p in pipes if not p in to_clean]
  257. heapq.heapify(pipes)
  258. # The holding area lets us restart pipes after n usages next
  259. # time round, since with lots of traffic an active pipe may
  260. # never reach 0 users
  261. self.pipelines_holding[:] = [p for p in self.pipelines_holding
  262. if p.users > 0]
  263. if self.pipelines_holding:
  264. logging.info("%d pipelines still scheduled for shutdown", len(self.pipelines_holding))
  265.  
  266. def getPipeCmds(self, l1, l2):
  267. if (l1, l2) not in self.pipeline_cmds:
  268. mode_path = self.pairs['%s-%s' % (l1, l2)]
  269. self.pipeline_cmds[(l1, l2)] = translation.parseModeFile(mode_path)
  270. return self.pipeline_cmds[(l1, l2)]
  271.  
  272. def shouldStartPipe(self, l1, l2):
  273. pipes = self.pipelines.get((l1, l2), [])
  274. if pipes == []:
  275. logging.info("%s-%s not in pipelines of this process",
  276. l1, l2)
  277. return True
  278. else:
  279. min_p = pipes[0]
  280. if len(pipes) < self.max_pipes_per_pair and min_p.users > self.max_users_per_pipe:
  281. logging.info("%s-%s has ≥%d users per pipe but only %d pipes",
  282. l1, l2, min_p.users, len(pipes))
  283. return True
  284. else:
  285. return False
  286.  
  287. def getPipeline(self, l1, l2):
  288. pair = (l1, l2)
  289. if self.shouldStartPipe(l1, l2):
  290. logging.info("Starting up a new pipeline for %s-%s …", l1, l2)
  291. if not pair in self.pipelines:
  292. self.pipelines[pair] = []
  293. p = translation.makePipeline(self.getPipeCmds(l1, l2))
  294. heapq.heappush(self.pipelines[pair], p)
  295. return self.pipelines[pair][0]
  296.  
  297. def logBeforeTranslation(self):
  298. if self.scaleMtLogs:
  299. return datetime.now()
  300. return
  301.  
  302. def logAfterTranslation(self, before, toTranslate):
  303. global wordtime
  304. global twordtime
  305. if self.scaleMtLogs:
  306. after = datetime.now()
  307. tInfo = TranslationInfo(self)
  308. key = getKey(tInfo.key)
  309. wordtime = after-before
  310. twordtime += wordtime
  311. scaleMtLog(self.get_status(), after-before, tInfo, key, len(toTranslate))
  312.  
  313.  
  314. @gen.coroutine
  315. def get(self):
  316. global nowords
  317. global tnowords
  318. global wordtime
  319. global twordtime
  320. toTranslate = self.get_argument('q')
  321. markUnknown = self.get_argument('markUnknown', default='yes') in ['yes', 'true', '1']
  322. nowords = len(toTranslate.split())
  323. tnowords += nowords
  324.  
  325. try:
  326. l1, l2 = map(toAlpha3Code, self.get_argument('langpair').split('|'))
  327. except ValueError:
  328. self.send_error(400, explanation='That pair is invalid, use e.g. eng|spa')
  329. if self.scaleMtLogs:
  330. global wordtime
  331. before = datetime.now()
  332. tInfo = TranslationInfo(self)
  333. key = getKey(tInfo.key)
  334. after = datetime.now()
  335. wordtime = after-before
  336. twordtime += wordtime
  337. scaleMtLog(400, after-before, tInfo, key, len(toTranslate))
  338. return
  339.  
  340. if '%s-%s' % (l1, l2) in self.pairs:
  341. global wordtime
  342. before = self.logBeforeTranslation()
  343. pipeline = self.getPipeline(l1, l2)
  344. self.notePairUsage((l1, l2))
  345. translated = yield pipeline.translate(toTranslate)
  346. self.logAfterTranslation(before, toTranslate)
  347. self.sendResponse({
  348. 'responseData': {
  349. 'translatedText': self.maybeStripMarks(markUnknown, l1, l2, translated)
  350. },
  351. 'responseDetails': None,
  352. 'responseStatus': 200,
  353. 'No of words': nowords,
  354. })
  355. self.cleanPairs()
  356. else:
  357. self.send_error(400, explanation='That pair is not installed')
  358. if self.scaleMtLogs:
  359. before = datetime.now()
  360. tInfo = TranslationInfo(self)
  361. key = getKey(tInfo.key)
  362. after = datetime.now()
  363. wordtime = after-before
  364. twordtime += wordtime
  365. scaleMtLog(400, after-before, tInfo, key, len(toTranslate))
  366.  
  367. class TranslateDocHandler(TranslateHandler):
  368. mimeTypeCommand = None
  369.  
  370. def getMimeType(self, f):
  371. commands = {
  372. 'mimetype': lambda x: Popen(['mimetype', '-b', x], stdout=PIPE).communicate()[0].strip(),
  373. 'xdg-mime': lambda x: Popen(['xdg-mime', 'query', 'filetype', x], stdout=PIPE).communicate()[0].strip(),
  374. 'file': lambda x: Popen(['file', '--mime-type', '-b', x], stdout=PIPE).communicate()[0].strip()
  375. }
  376.  
  377. typeFiles = {
  378. 'word/document.xml': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
  379. 'ppt/presentation.xml': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
  380. 'xl/workbook.xml': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
  381. }
  382.  
  383. if not self.mimeTypeCommand:
  384. for command in ['mimetype', 'xdg-mime', 'file']:
  385. if Popen(['which', command], stdout=PIPE).communicate()[0]:
  386. TranslateDocHandler.mimeTypeCommand = command
  387. break
  388.  
  389. mimeType = commands[self.mimeTypeCommand](f).decode('utf-8')
  390. if mimeType == 'application/zip':
  391. with zipfile.ZipFile(f) as zf:
  392. for typeFile in typeFiles:
  393. if typeFile in zf.namelist():
  394. return typeFiles[typeFile]
  395.  
  396. if 'mimetype' in zf.namelist():
  397. return zf.read('mimetype').decode('utf-8')
  398.  
  399. return mimeType
  400.  
  401. else:
  402. return mimeType
  403.  
  404. @tornado.web.asynchronous
  405. def get(self):
  406. try:
  407. l1, l2 = map(toAlpha3Code, self.get_argument('langpair').split('|'))
  408. except ValueError:
  409. self.send_error(400, explanation='That pair is invalid, use e.g. eng|spa')
  410.  
  411. allowedMimeTypes = {
  412. 'text/plain': 'txt',
  413. 'text/html': 'html-noent',
  414. 'text/rtf': 'rtf',
  415. 'application/rtf': 'rtf',
  416. 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
  417. 'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
  418. 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
  419. # 'application/msword', 'application/vnd.ms-powerpoint', 'application/vnd.ms-excel'
  420. 'application/vnd.oasis.opendocument.text': 'odt',
  421. 'application/x-latex': 'latex',
  422. 'application/x-tex': 'latex'
  423. }
  424.  
  425. if '%s-%s' % (l1, l2) in self.pairs:
  426. body = self.request.files['file'][0]['body']
  427. if len(body) > 32E6:
  428. self.send_error(413, explanation='That file is too large')
  429. else:
  430. with tempfile.NamedTemporaryFile() as tempFile:
  431. tempFile.write(body)
  432. tempFile.seek(0)
  433.  
  434. mtype = self.getMimeType(tempFile.name)
  435. if mtype in allowedMimeTypes:
  436. self.request.headers['Content-Type'] = 'application/octet-stream'
  437. self.request.headers['Content-Disposition'] = 'attachment'
  438.  
  439. self.write(translation.translateDoc(tempFile, allowedMimeTypes[mtype], self.pairs['%s-%s' % (l1, l2)]))
  440. self.finish()
  441. else:
  442. self.send_error(400, explanation='Invalid file type %s' % mtype)
  443. else:
  444. self.send_error(400, explanation='That pair is not installed')
  445.  
  446. class AnalyzeHandler(BaseHandler):
  447. def postproc_text(self, in_text, result):
  448. lexical_units = util.removeDotFromDeformat(in_text, re.findall(r'\^([^\$]*)\$([^\^]*)', result))
  449. return [(lu[0], lu[0].split('/')[0] + lu[1])
  450. for lu
  451. in lexical_units]
  452.  
  453. @tornado.web.asynchronous
  454. @gen.coroutine
  455. def get(self):
  456. in_text = self.get_argument('q')
  457. in_mode = toAlpha3Code(self.get_argument('lang'))
  458. if in_mode in self.analyzers:
  459. [path, mode] = self.analyzers[in_mode]
  460. formatting = 'txt'
  461. commands = [['apertium', '-d', path, '-f', formatting, mode]]
  462. result = yield translation.translateSimple(in_text, commands)
  463. self.sendResponse(self.postproc_text(in_text, result))
  464. else:
  465. self.send_error(400, explanation='That mode is not installed')
  466.  
  467.  
  468. class GenerateHandler(BaseHandler):
  469. def preproc_text(self, in_text):
  470. lexical_units = re.findall(r'(\^[^\$]*\$[^\^]*)', in_text)
  471. if len(lexical_units) == 0:
  472. lexical_units = ['^%s$' % (in_text,)]
  473. return lexical_units, '[SEP]'.join(lexical_units)
  474.  
  475. def postproc_text(self, lexical_units, result):
  476. return [(generation, lexical_units[i])
  477. for (i, generation)
  478. in enumerate(result.split('[SEP]'))]
  479.  
  480. @tornado.web.asynchronous
  481. @gen.coroutine
  482. def get(self):
  483. in_text = self.get_argument('q')
  484. in_mode = toAlpha3Code(self.get_argument('lang'))
  485. if in_mode in self.generators:
  486. [path, mode] = self.generators[in_mode]
  487. formatting = 'none'
  488. commands = [['apertium', '-d', path, '-f', formatting, mode]]
  489. lexical_units, to_generate = self.preproc_text(in_text)
  490. result = yield translation.translateSimple(to_generate, commands)
  491. self.sendResponse(self.postproc_text(lexical_units, result))
  492. else:
  493. self.send_error(400, explanation='That mode is not installed')
  494.  
  495. class ListLanguageNamesHandler(BaseHandler):
  496. @tornado.web.asynchronous
  497. def get(self):
  498. localeArg = self.get_argument('locale')
  499. languagesArg = self.get_argument('languages', default=None)
  500.  
  501. if self.langNames:
  502. if localeArg:
  503. if languagesArg:
  504. self.sendResponse(getLocalizedLanguages(localeArg, self.langNames, languages=languagesArg.split(' ')))
  505. else:
  506. self.sendResponse(getLocalizedLanguages(localeArg, self.langNames))
  507. elif 'Accept-Language' in self.request.headers:
  508. locales = [locale.split(';')[0] for locale in self.request.headers['Accept-Language'].split(',')]
  509. for locale in locales:
  510. languageNames = getLocalizedLanguages(locale, self.langNames)
  511. if languageNames:
  512. self.sendResponse(languageNames)
  513. return
  514. self.sendResponse(getLocalizedLanguages('en', self.langNames))
  515. else:
  516. self.sendResponse(getLocalizedLanguages('en', self.langNames))
  517. else:
  518. self.sendResponse({})
  519.  
  520. class PerWordHandler(BaseHandler):
  521. @tornado.web.asynchronous
  522. @gen.coroutine
  523. def get(self):
  524. lang = toAlpha3Code(self.get_argument('lang'))
  525. modes = set(self.get_argument('modes').split(' '))
  526. query = self.get_argument('q')
  527.  
  528. if not modes <= {'morph', 'biltrans', 'tagger', 'disambig', 'translate'}:
  529. self.send_error(400, explanation='Invalid mode argument')
  530. return
  531.  
  532. def handleOutput(output):
  533. '''toReturn = {}
  534. for mode in modes:
  535. toReturn[mode] = outputs[mode]
  536. for mode in modes:
  537. toReturn[mode] = {outputs[mode + '_inputs'][index]: output for (index, output) in enumerate(outputs[mode])}
  538. for mode in modes:
  539. toReturn[mode] = [(outputs[mode + '_inputs'][index], output) for (index, output) in enumerate(outputs[mode])]
  540. for mode in modes:
  541. toReturn[mode] = {'outputs': outputs[mode], 'inputs': outputs[mode + '_inputs']}
  542. self.sendResponse(toReturn)'''
  543.  
  544. if output is None:
  545. self.send_error(400, explanation='No output')
  546. return
  547. elif not output:
  548. self.send_error(408, explanation='Request timed out')
  549. return
  550. else:
  551. outputs, tagger_lexicalUnits, morph_lexicalUnits = output
  552.  
  553. toReturn = []
  554.  
  555. for (index, lexicalUnit) in enumerate(tagger_lexicalUnits if tagger_lexicalUnits else morph_lexicalUnits):
  556. unitToReturn = {}
  557. unitToReturn['input'] = stripTags(lexicalUnit.split('/')[0])
  558. for mode in modes:
  559. unitToReturn[mode] = outputs[mode][index]
  560. toReturn.append(unitToReturn)
  561.  
  562. if self.get_argument('pos', default=None):
  563. requestedPos = int(self.get_argument('pos')) - 1
  564. currentPos = 0
  565. for unit in toReturn:
  566. input = unit['input']
  567. currentPos += len(input.split(' '))
  568. if requestedPos < currentPos:
  569. self.sendResponse(unit)
  570. return
  571. else:
  572. self.sendResponse(toReturn)
  573.  
  574. pool = Pool(processes=1)
  575. result = pool.apply_async(processPerWord, (self.analyzers, self.taggers, lang, modes, query))
  576. pool.close()
  577.  
  578. @run_async_thread
  579. def worker(callback):
  580. try:
  581. callback(result.get(timeout=self.timeout))
  582. except TimeoutError:
  583. pool.terminate()
  584. callback(None)
  585.  
  586. output = yield tornado.gen.Task(worker)
  587. handleOutput(output)
  588.  
  589. class CoverageHandler(BaseHandler):
  590. @tornado.web.asynchronous
  591. @gen.coroutine
  592. def get(self):
  593. mode = toAlpha3Code(self.get_argument('lang'))
  594. text = self.get_argument('q')
  595. if not text:
  596. self.send_error(400, explanation='Missing q argument')
  597. return
  598.  
  599. def handleCoverage(coverage):
  600. if coverage is None:
  601. self.send_error(408, explanation='Request timed out')
  602. else:
  603. self.sendResponse([coverage])
  604.  
  605. if mode in self.analyzers:
  606. pool = Pool(processes=1)
  607. result = pool.apply_async(getCoverage, [text, self.analyzers[mode][0], self.analyzers[mode][1]])
  608. pool.close()
  609.  
  610. @run_async_thread
  611. def worker(callback):
  612. try:
  613. callback(result.get(timeout=self.timeout))
  614. except TimeoutError:
  615. pool.terminate()
  616. callback(None)
  617.  
  618. coverage = yield tornado.gen.Task(worker)
  619. handleCoverage(coverage)
  620. else:
  621. self.send_error(400, explanation='That mode is not installed')
  622.  
  623. class IdentifyLangHandler(BaseHandler):
  624. @tornado.web.asynchronous
  625. def get(self):
  626. text = self.get_argument('q')
  627. if not text:
  628. return self.send_error(400, explanation='Missing q argument')
  629.  
  630. if cld2:
  631. cldResults = cld2.detect(text)
  632. if cldResults[0]:
  633. possibleLangs = filter(lambda x: x[1] != 'un', cldResults[2])
  634. self.sendResponse({toAlpha3Code(possibleLang[1]): possibleLang[2] for possibleLang in possibleLangs})
  635. else:
  636. self.sendResponse({'nob': 100}) # TODO: Some more reasonable response
  637. else:
  638. def handleCoverages(coverages):
  639. self.sendResponse(coverages)
  640.  
  641. pool = Pool(processes=1)
  642. result = pool.apply_async(getCoverages, [text, self.analyzers], {'penalize': True}, callback=handleCoverages)
  643. pool.close()
  644. try:
  645. coverages = result.get(timeout=self.timeout)
  646. except TimeoutError:
  647. self.send_error(408, explanation='Request timed out')
  648. pool.terminate()
  649.  
  650. class GetLocaleHandler(BaseHandler):
  651. @tornado.web.asynchronous
  652. def get(self):
  653. if 'Accept-Language' in self.request.headers:
  654. locales = [locale.split(';')[0] for locale in self.request.headers['Accept-Language'].split(',')]
  655. self.sendResponse(locales)
  656. else:
  657. self.send_error(400, explanation='Accept-Language missing from request headers')
  658.  
  659. class PipeDebugHandler(BaseHandler):
  660.  
  661. @gen.coroutine
  662. def get(self):
  663. toTranslate = self.get_argument('q')
  664.  
  665. try:
  666. l1, l2 = map(toAlpha3Code, self.get_argument('langpair').split('|'))
  667. except ValueError:
  668. self.send_error(400, explanation='That pair is invalid, use e.g. eng|spa')
  669.  
  670. mode_path = self.pairs['%s-%s' % (l1, l2)]
  671. try:
  672. _, commands = translation.parseModeFile(mode_path)
  673. except Exception:
  674. self.send_error(500)
  675. return
  676.  
  677. res = yield translation.translatePipeline(toTranslate, commands)
  678. if self.get_status() != 200:
  679. self.send_error(self.get_status())
  680. return
  681.  
  682. output, pipeline = res
  683.  
  684. self.sendResponse({
  685. 'responseData': {'output': output, 'pipeline': pipeline},
  686. 'responseDetails': None,
  687. 'responseStatus': 200
  688. })
  689.  
  690. missingFreqsDb = ''
  691.  
  692. def setupHandler(port, pairs_path, nonpairs_path, langNames, missingFreqs, timeout, max_pipes_per_pair, min_pipes_per_pair, max_users_per_pipe, max_idle_secs, restart_pipe_after, verbosity=0, scaleMtLogs=False, memory=0):
  693.  
  694. global missingFreqsDb
  695. missingFreqsDb= missingFreqs
  696.  
  697. Handler = BaseHandler
  698. Handler.langNames = langNames
  699. Handler.missingFreqs = missingFreqs
  700. Handler.timeout = timeout
  701. Handler.max_pipes_per_pair = max_pipes_per_pair
  702. Handler.min_pipes_per_pair = min_pipes_per_pair
  703. Handler.max_users_per_pipe = max_users_per_pipe
  704. Handler.max_idle_secs = max_idle_secs
  705. Handler.restart_pipe_after = restart_pipe_after
  706. Handler.scaleMtLogs = scaleMtLogs
  707. Handler.inMemoryUnknown = True if memory > 0 else False
  708. Handler.inMemoryLimit = memory
  709. Handler.verbosity = verbosity
  710.  
  711. modes = searchPath(pairs_path, verbosity=verbosity)
  712. if nonpairs_path:
  713. src_modes = searchPath(nonpairs_path, include_pairs=False, verbosity=verbosity)
  714. for mtype in modes:
  715. modes[mtype] += src_modes[mtype]
  716.  
  717. [logging.info('%d %s modes found' % (len(modes[mtype]), mtype)) for mtype in modes]
  718.  
  719. for path, lang_src, lang_trg in modes['pair']:
  720. Handler.pairs['%s-%s' % (lang_src, lang_trg)] = path
  721. for dirpath, modename, lang_pair in modes['analyzer']:
  722. Handler.analyzers[lang_pair] = (dirpath, modename)
  723. for dirpath, modename, lang_pair in modes['generator']:
  724. Handler.generators[lang_pair] = (dirpath, modename)
  725. for dirpath, modename, lang_pair in modes['tagger']:
  726. Handler.taggers[lang_pair] = (dirpath, modename)
  727.  
  728. def sanity_check():
  729. locale_vars = ["LANG", "LC_ALL"]
  730. u8 = re.compile("UTF-?8", re.IGNORECASE)
  731. if not any(re.search(u8, os.environ.get(key, ""))
  732. for key in locale_vars):
  733. print("servlet.py: error: APY needs a UTF-8 locale, please set LANG or LC_ALL",
  734. file=sys.stderr)
  735. sys.exit(1)
  736.  
  737. if __name__ == '__main__':
  738. sanity_check()
  739. parser = argparse.ArgumentParser(description='Start Apertium APY')
  740. parser.add_argument('pairs_path', help='path to Apertium installed pairs (all modes files in this path are included)')
  741. parser.add_argument('-s', '--nonpairs-path', help='path to Apertium SVN (only non-translator debug modes are included from this path)')
  742. parser.add_argument('-l', '--lang-names', help='path to localised language names sqlite database (default = langNames.db)', default='langNames.db')
  743. parser.add_argument('-f', '--missing-freqs', help='path to missing frequency sqlite database (default = None)', default=None)
  744. parser.add_argument('-p', '--port', help='port to run server on (default = 2737)', type=int, default=2737)
  745. parser.add_argument('-c', '--ssl-cert', help='path to SSL Certificate', default=None)
  746. parser.add_argument('-k', '--ssl-key', help='path to SSL Key File', default=None)
  747. parser.add_argument('-t', '--timeout', help='timeout for requests (default = 10)', type=int, default=10)
  748. parser.add_argument('-j', '--num-processes', help='number of processes to run (default = 1; use 0 to run one http server per core, where each http server runs all available language pairs)', nargs='?', type=int, default=1)
  749. parser.add_argument('-d', '--daemon', help='daemon mode: redirects stdout and stderr to files apertium-apy.log and apertium-apy.err ; use with --log-path', action='store_true')
  750. parser.add_argument('-P', '--log-path', help='path to log output files to in daemon mode; defaults to local directory', default='./')
  751. parser.add_argument('-i', '--max-pipes-per-pair', help='how many pipelines we can spin up per language pair (default = 1)', type=int, default=1)
  752. parser.add_argument('-n', '--min-pipes-per-pair', help='when shutting down pipelines, keep at least this many open per language pair (default = 0)', type=int, default=0)
  753. parser.add_argument('-u', '--max-users-per-pipe', help='how many concurrent requests per pipeline before we consider spinning up a new one (default = 5)', type=int, default=5)
  754. parser.add_argument('-m', '--max-idle-secs', help='if specified, shut down pipelines that have not been used in this many seconds', type=int, default=0)
  755. parser.add_argument('-r', '--restart-pipe-after', help='restart a pipeline if it has had this many requests (default = 1000)', type=int, default=1000)
  756. parser.add_argument('-v', '--verbosity', help='logging verbosity', type=int, default=0)
  757. parser.add_argument('-S', '--scalemt-logs', help='generates ScaleMT-like logs; use with --log-path; disables', action='store_true')
  758. parser.add_argument('-M', '--unknown-memory-limit', help="keeps unknown words in memory until a limit is reached", type=int, default=0)
  759. args = parser.parse_args()
  760.  
  761. if args.daemon:
  762. # regular content logs are output stderr
  763. # python messages are mostly output to stdout
  764. # hence swapping the filenames?
  765. sys.stderr = open(os.path.join(args.log_path, 'apertium-apy.log'), 'a+')
  766. sys.stdout = open(os.path.join(args.log_path, 'apertium-apy.err'), 'a+')
  767.  
  768. logging.getLogger().setLevel(logging.INFO)
  769. enable_pretty_logging()
  770.  
  771. if args.scalemt_logs:
  772. logger = logging.getLogger('scale-mt')
  773. logger.propagate = False
  774. smtlog = os.path.join(args.log_path, 'ScaleMTRequests.log')
  775. loggingHandler = logging.handlers.TimedRotatingFileHandler(smtlog,'midnight',0)
  776. loggingHandler.suffix = "%Y-%m-%d"
  777. logger.addHandler(loggingHandler)
  778.  
  779. # if scalemt_logs is enabled, disable tornado.access logs
  780. if(args.daemon):
  781. logging.getLogger("tornado.access").propagate = False
  782.  
  783. if not cld2:
  784. logging.warning('Unable to import CLD2, continuing using naive method of language detection')
  785.  
  786. setupHandler(args.port, args.pairs_path, args.nonpairs_path, args.lang_names, args.missing_freqs, args.timeout, args.max_pipes_per_pair, args.min_pipes_per_pair, args.max_users_per_pipe, args.max_idle_secs, args.restart_pipe_after, args.verbosity, args.scalemt_logs, args.unknown_memory_limit)
  787.  
  788. application = tornado.web.Application([
  789. (r'/', RootHandler),
  790. (r'/list', ListHandler),
  791. (r'/listPairs', ListHandler),
  792. (r'/stats', StatsHandler),
  793. (r'/translate', TranslateHandler),
  794. (r'/translateDoc', TranslateDocHandler),
  795. (r'/analy[sz]e', AnalyzeHandler),
  796. (r'/generate', GenerateHandler),
  797. (r'/listLanguageNames', ListLanguageNamesHandler),
  798. (r'/perWord', PerWordHandler),
  799. (r'/calcCoverage', CoverageHandler),
  800. (r'/identifyLang', IdentifyLangHandler),
  801. (r'/getLocale', GetLocaleHandler),
  802. (r'/pipedebug', PipeDebugHandler)
  803. ])
  804.  
  805. global http_server
  806. if args.ssl_cert and args.ssl_key:
  807. http_server = tornado.httpserver.HTTPServer(application, ssl_options = {
  808. 'certfile': args.ssl_cert,
  809. 'keyfile': args.ssl_key,
  810. })
  811. logging.info('Serving at https://localhost:%s' % args.port)
  812. else:
  813. http_server = tornado.httpserver.HTTPServer(application)
  814. logging.info('Serving at http://localhost:%s' % args.port)
  815.  
  816. signal.signal(signal.SIGTERM, sig_handler)
  817. signal.signal(signal.SIGINT, sig_handler)
  818.  
  819. http_server.bind(args.port)
  820. http_server.start(args.num_processes)
  821. tornado.ioloop.IOLoop.instance().start()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement