Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/python
# -*- coding: utf-8 -*-
import argparse
import commands
import math
import os
import re
import subprocess

import requests
from bs4 import BeautifulSoup
from flask import Flask, jsonify, request, redirect, url_for, render_template
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from werkzeug import secure_filename
# Shared, module-level corpus state mutated by process_new_sentence().
# word_d: lower-cased token -> number of occurrences across all sentences seen.
word_d = {}
# sent_list: every sentence passed to process_new_sentence(), in arrival
# order; compute_idf() treats each entry as one document.
sent_list = []
# Compiled once at import time rather than on every call.
# Raw strings keep the regex free of invalid string escapes such as "\;".
_TAG_RE = re.compile(r'<.*?>')
_PUNCT_RE = re.compile(r'[:&;/.,")(]')


def cleanStr(data):
    """Return *data* with HTML-style tags and selected punctuation removed.

    First every shortest ``<...>`` span on a single line is deleted, then each
    of the characters ``: & ; / . , " ) (`` is deleted.  Note that ``.`` and
    ``,`` are stripped as well, so ``"1,234.56"`` becomes ``"123456"``.

    :param data: text to clean.
    :returns: the cleaned text.
    """
    without_tags = _TAG_RE.sub('', data)
    return _PUNCT_RE.sub('', without_tags)
def process_new_sentence(s):
    """Record sentence *s* and add its terms to the global word counts.

    The sentence is appended unchanged to ``sent_list``.  Its lower-cased
    tokens (from ``word_tokenize``) are then counted in ``word_d``, skipping
    English stop words.

    :param s: one sentence of text.
    """
    sent_list.append(s)
    # Look the stop-word list up once per call instead of once per token, and
    # keep it in a set so each membership test is O(1).
    stop_words = set(stopwords.words('english'))
    for word in word_tokenize(s.lower()):
        if word in stop_words:
            continue
        word_d[word] = word_d.get(word, 0) + 1
def compute_tf(s):
    """Return the term-frequency table of sentence *s*.

    Tokens come from ``word_tokenize(s)`` without case folding; English stop
    words are ignored.  Each remaining term's occurrence count is divided by
    the number of *distinct* remaining terms in the sentence (not by the
    total token count).

    :param s: one sentence of text.
    :returns: dict mapping term -> float frequency; empty when the sentence
        contains no non-stop-word tokens.
    """
    # Fetch the stop-word list once per call rather than once per token.
    stop_words = set(stopwords.words('english'))
    counts = {}
    for tok in word_tokenize(s):
        if tok not in stop_words:
            counts[tok] = counts.get(tok, 0) + 1
    # The number of keys is exactly the number of distinct terms, so no
    # separate bag-of-words set is needed.
    distinct = len(counts)
    return {term: float(n) / distinct for term, n in counts.items()}
def compute_idf():
    """Return the inverse document frequency of every term in ``sent_list``.

    Each entry of ``sent_list`` is one document.  For every non-stop-word
    token ``t`` (tokenised without case folding) the result holds
    ``log10(N / df(t))``, where ``N`` is ``len(sent_list)`` and ``df(t)`` is
    the number of sentences whose tokens include ``t``.

    :returns: dict mapping term -> float idf; empty when ``sent_list`` holds
        no non-stop-word tokens.
    """
    stop_words = set(stopwords.words('english'))
    doc_freq = {}
    # Tokenise each sentence exactly once.  Re-tokenising every sentence for
    # every term is quadratic in the corpus size.
    for sentence in sent_list:
        # set(): a term counts once per sentence however often it repeats.
        for term in set(word_tokenize(sentence)):
            if term not in stop_words:
                doc_freq[term] = doc_freq.get(term, 0) + 1
    total = float(len(sent_list))
    # Every df is >= 1 because each term was seen in at least one sentence.
    return {term: math.log10(total / df) for term, df in doc_freq.items()}
# Directory that receives uploaded files.
# NOTE(review): this looks like a placeholder path -- point it at a real,
# writable directory before deploying.
UPLOAD_FOLDER = "path/to/the/uploads"
# File extensions (without the dot) accepted by allowed_file().
ALLOWED_EXTENSIONS = {'txt'}

app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
def allowed_file(filename):
    """Return True when *filename* has an extension in ALLOWED_EXTENSIONS.

    The extension is the text after the last ``.`` and is compared
    case-insensitively, so ``notes.TXT`` is accepted like ``notes.txt``.
    Names without a dot are rejected.

    :param filename: file name to check.
    :returns: bool.
    """
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1].lower()
    return extension in ALLOWED_EXTENSIONS
@app.route('/', methods=['GET'])
def main_page():
    """Serve the landing page; it only renders a template and runs no logic."""
    return render_template('main_page.html')
@app.route('/url_checker1', methods=['POST', 'GET'])
def checker1():
    """Fetch a page, extract its ``qwidget_lastsale`` element and rank terms.

    POST: the form field ``input`` holds the URL to fetch.  The text of the
    element with id ``qwidget_lastsale`` is cleaned with ``cleanStr``, split
    into lines, and each line is scored with TF-IDF.  The ten highest-scoring
    terms are printed to stdout and ``checker1.html`` is rendered with the
    cleaned text as ``price``.

    GET: there is nothing to process, so the client is redirected to the
    main page instead of the view returning ``None``.

    :returns: a Flask response, or a ``(message, status)`` tuple when the
        page cannot be fetched (502) or lacks the expected element (404).
    """
    if request.method != 'POST':
        return redirect(url_for('main_page'))

    url = request.form['input']
    # NOTE(security): the URL comes straight from the client, so this endpoint
    # can be made to fetch arbitrary (including internal) addresses.  Restrict
    # the allowed hosts before exposing this service.
    try:
        # A timeout keeps an unresponsive host from blocking the worker.
        res = requests.get(url, timeout=10)
    except requests.RequestException:
        # Do not echo the client-supplied URL back into the response body.
        return 'Could not fetch the requested URL', 502

    soup = BeautifulSoup(res.content, 'html.parser')
    element = soup.find(id='qwidget_lastsale')
    if element is None:
        return 'The page has no element with id "qwidget_lastsale"', 404
    price = cleanStr(element.text)

    # The corpus lives in module-level globals.  Start from a clean slate so
    # sentences from earlier requests do not leak into this request's scores.
    word_d.clear()
    del sent_list[:]
    for sent in price.split('\n'):
        process_new_sentence(sent)

    idf_d = compute_idf()
    # Keep, for every term, the best TF-IDF score it reaches in any sentence.
    result = {}
    for sentence in sent_list:
        for word, tfval in compute_tf(sentence).items():
            score = tfval * idf_d[word]
            if word not in result or result[word] < score:
                result[word] = score

    ranked = sorted(result.items(), key=lambda item: item[1], reverse=True)
    for word, score in ranked[:10]:
        # Scores are fractional, so format them as floats ("%d" would
        # truncate almost all of them to 0).
        print("%-15s%10.4f" % (word, score))
    return render_template('checker1.html', price=price)
@app.route('/url_checker2', methods=['POST', 'GET'])
def checker2():
    """Store an uploaded text file and return its first line.

    POST: the multipart field ``file`` is saved inside
    ``app.config['UPLOAD_FOLDER']`` under its sanitised name, and the first
    line of the saved file (stripped of surrounding whitespace) is returned.
    An empty file yields an empty response body.

    GET: there is nothing to process, so the client is redirected to the
    main page instead of the view returning ``None``.

    :returns: a Flask response, or a ``(message, 400)`` tuple when the file
        name is missing or its extension is not allowed.
    """
    if request.method != 'POST':
        return redirect(url_for('main_page'))

    upload = request.files['file']
    # Sanitise once and use the same name for saving and reading.  Opening
    # the raw client-supplied name would miss the saved file or read an
    # arbitrary path.
    filename = secure_filename(upload.filename or '')
    if not allowed_file(filename):
        return 'Missing file name or unsupported file type', 400

    folder = app.config['UPLOAD_FOLDER']
    if not os.path.isdir(folder):
        os.makedirs(folder)
    path = os.path.join(folder, filename)
    upload.save(path)

    with open(path, 'r') as saved:
        # readline() returns '' for an empty file, so no IndexError is possible.
        return saved.readline().strip()
if __name__ == '__main__':
    # argparse reports bad or missing arguments itself and exits through
    # SystemExit, so wrapping it in "except Exception" would catch nothing.
    parser = argparse.ArgumentParser(description="")
    # type=int makes argparse reject a non-numeric port with a usage message.
    parser.add_argument('--listen-port', type=int, required=True,
                        help='REST service listen port')
    args = parser.parse_args()

    # "hostname -I" (Linux) prints the host's addresses; bind to the first.
    # The argument list avoids a shell, and subprocess exists on Python 2
    # and 3, unlike the Python 2-only "commands" module.
    try:
        addresses = subprocess.check_output(['hostname', '-I']).decode().split()
    except (OSError, subprocess.CalledProcessError):
        addresses = []
    # Fall back to loopback when no address could be determined, rather than
    # crashing with an IndexError.
    ipaddr = addresses[0] if addresses else '127.0.0.1'

    print("Starting the service with ip_addr " + ipaddr)
    app.run(debug=False, host=ipaddr, port=args.listen_port)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement