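# -*- coding: utf-8 -*-
# A crawler + searcher for news.rambler.ru backed by PostgreSQL, in the spirit
# of the search-engine chapter of "Programming Collective Intelligence".
# The schema is not part of this paste; the sketch below is an assumption
# reconstructed from the queries used in the code:
#
#   create table url_list      (id serial primary key, url text, page_rank float);
#   create table word_list     (id serial primary key, word text);
#   create table word_location (url_id int, word_id int, location int);
#   create table link          (id serial primary key, from_id int, to_id int);
#   create table link_words    (link_id int, word_id int);
#   create table hidden_node   (id serial primary key, create_key text);
#   create table word_hidden   (id serial primary key, from_id int, to_id int, strength float);
#   create table hidden_url    (id serial primary key, from_id int, to_id int, strength float);

# Imports the code below relies on (Python 2: print statements, xrange, urlparse)
import re
import psycopg2
from math import tanh
from urlparse import urljoin
from grab import Grab

# dtanh is called in back_propagate but is not defined in the paste; the
# standard trick of computing the derivative from the output y = tanh(x)
# is assumed here.
def dtanh(y):
    return 1.0 - y * y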

class Indexer:
    def __init__(self):
        self.conn = psycopg2.connect("dbname='postgres' user='postgres' password='120789' host='localhost'")
        self.cur = self.conn.cursor()
        self.cur.execute("set search_path to 'rambler_search'")
        self.grab = Grab()

    def __del__(self):
        self.conn.close()

    def dbcommit(self):
        self.conn.commit()

    # Start from a seed list of pages and crawl
    # breadth-first down to the given depth.
    def crawl(self, pages, depth=2):
        rambler_news_href_pattern = re.compile(r'(^http:\/\/news\.rambler\.ru\/[\d]+)')
        for i in range(depth):
            newpages = {}
            for page in pages:
                try:
                    self.grab.go(page)
                except Exception:
                    print "Could not open %s" % page
                    continue
                try:
                    article_text = ''  # full text of the article
                    for paragraph in self.grab.tree.xpath("//p[contains(@class, 'b-article__paragraph')]"):
                        article_text += paragraph.text_content()
                    self.addtoindex(page, article_text)  # index the article text
                    links = self.grab.tree.xpath("//a")
                    for link in links:
                        if 'href' in link.attrib:
                            # Build an absolute URL and strip any #fragment
                            url = urljoin(page, link.attrib['href']).split('#')[0]
                            match = rambler_news_href_pattern.findall(url)
                            if match:
                                url = match[0]
                            if url[0:4] == 'http' and not self.isindexed(url):
                                newpages[url] = 1
                            linkText = link.text_content()  # anchor text
                            self.addlinkref(page, url, linkText)  # index the anchor text
                    self.dbcommit()
                except Exception, e:
                    print "Could not parse page %s" % page, e
            pages = newpages

    # Split raw text into lowercased words longer than three characters.
    def getwords(self, text):
        words = []
        for split_str in text.split():
            t = re.split("[\s;:\-_*\".,?!()'&#«»]", split_str)
            words += [a.lower() for a in t if a != '' and len(a) > 3]
        return words

    def addtoindex(self, url, text):
        if self.isindexed(url): return  # skip pages that are already indexed
        print 'Indexing %s' % url
        # Extract the words from the text
        words = self.getwords(text)
        # Get the id of this URL
        url_id = self.getentryid('url_list', 'url', url)
        # Link every word to this URL, keeping its position
        for i, word in enumerate(words):
            word_id = self.getentryid('word_list', 'word', word)
            self.cur.execute("insert into word_location(url_id, word_id, location) values (%s, %s, %s)", (url_id, word_id, i))
        self.dbcommit()

    # Look up the id of a record in the given table;
    # if it is missing, insert it and return the new id.
    def getentryid(self, table, field, value, createnew=True):
        # table and field come from trusted call sites; only value is escaped
        self.cur.execute("select id from %s where %s = %%s" % (table, field), (value,))
        cur = self.cur.fetchone()
        if not cur:
            self.cur.execute("insert into %s (%s) values (%%s) returning %s.id" % (table, field, table), (value,))
            cur = self.cur.fetchone()
            self.dbcommit()
        return cur[0]

    # Return True if the crawler has already visited and indexed the page.
    def isindexed(self, url):
        self.cur.execute("select id from url_list where url = %s", (url,))
        u = self.cur.fetchone()
        if u:
            # The URL is known; check that its words were actually indexed
            self.cur.execute("select count(1) from word_location where url_id = %s", (u[0],))
            v = self.cur.fetchone()
            if v[0]:
                return True
        return False

    def addlinkref(self, urlFrom, urlTo, linkText):
        words = self.getwords(linkText)
        from_id = self.getentryid('url_list', 'url', urlFrom)
        to_id = self.getentryid('url_list', 'url', urlTo)
        if from_id == to_id: return
        self.cur.execute("insert into link(from_id, to_id) values (%s, %s) returning link.id", (from_id, to_id))
        link_id = self.cur.fetchone()[0]
        for word in words:
            word_id = self.getentryid('word_list', 'word', word)
            self.cur.execute("insert into link_words(link_id, word_id) values (%s, %s)", (link_id, word_id))

    # Build and run a self-join over word_location that finds every URL
    # containing all of the query words, one wlN alias per word.
    def get_match_rows(self, query):
        select_query_add = []
        join_query_add = []
        where_query_add = []
        main_search_query = """
            SELECT wl0.url_id, %s
            FROM word_location wl0
            %s
            WHERE %s
        """
        query_words = self.getwords(query)
        query_word_ids = []
        for query_word in query_words:
            self.cur.execute("select id from word_list where word = %s", (query_word,))
            query_word_id = self.cur.fetchone()
            if query_word_id:
                query_word_ids.append(query_word_id[0])
        if not query_word_ids:
            return [], []  # none of the query words are in the index
        for position, query_word_id in enumerate(query_word_ids):
            if position:
                join_query_add.append('JOIN word_location wl%d ON wl%d.url_id = wl%d.url_id' % (position, position - 1, position))
            select_query_add.append('wl%d.location' % position)
            where_query_add.append('wl%d.word_id = %d' % (position, query_word_id))
        main_search_query = main_search_query % (', '.join(select_query_add), ' '.join(join_query_add), ' and '.join(where_query_add))
        self.cur.execute(main_search_query)
        search_results = self.cur.fetchall()
        return search_results, query_word_ids

    # Combine the individual scoring functions into a weighted total per URL.
    def get_scored_list(self, rows, word_ids):
        total_scores = {row[0]: 0 for row in rows}
        if rows:
            # List of (weight, scores) pairs; left empty in the original paste.
            # A typical entry would be e.g. (1.0, self.frequency_score(rows)).
            weight_functions = [
            ]
            for (weight, scores) in weight_functions:
                for url in total_scores:
                    total_scores[url] += weight * scores[url]
        return total_scores

    # Return the full URL for a given url_list id.
    def get_url_name(self, url_id):
        self.cur.execute("select url from url_list where id = %s", (url_id,))
        return self.cur.fetchone()[0]

    # Main search entry point: rank the matches and print the top ten.
    def search(self, search_sentence):
        search_results, word_ids = self.get_match_rows(search_sentence)
        scores = self.get_scored_list(search_results, word_ids)
        ranked_scores = [(score, url_id) for (url_id, score) in scores.items()]
        ranked_scores.sort()
        ranked_scores.reverse()
        for (score, url_id) in ranked_scores[0:10]:
            print '%f\t%s' % (score, self.get_url_name(url_id))
        return word_ids, [r[1] for r in ranked_scores[0:10]]

    # Normalize scores to the range 0..1, where 1 is always best.
    def normalize_scores(self, scores, smallIsBetter=0):
        vsmall = 0.00001  # avoid division-by-zero errors
        if smallIsBetter:
            minscore = min(scores.values())
            return {u: float(minscore) / max(vsmall, l) for (u, l) in scores.items()}
        else:
            maxscore = max(scores.values())
            if maxscore == 0: maxscore = vsmall
            return {u: float(c) / maxscore for (u, c) in scores.items()}

    def frequency_score(self, rows):
        counts = {row[0]: 0 for row in rows}
        for row in rows:
            counts[row[0]] += 1
        return self.normalize_scores(counts)

    # Score by how early in the document the query words appear.
    def location_score(self, rows):
        locations = {}
        for row in rows:
            loc = sum(row[1:])
            if row[0] in locations:
                if loc < locations[row[0]]:
                    locations[row[0]] = loc
            else:
                locations[row[0]] = loc
        return self.normalize_scores(locations, smallIsBetter=1)

    # Score by how close together the query words appear.
    def distance_score(self, rows):
        # With a single query word, every document scores the same
        if len(rows[0]) <= 2:
            return {row[0]: 1.0 for row in rows}
        mindistance = {}
        for row in rows:
            dist = sum([abs(row[i] - row[i - 1]) for i in xrange(2, len(row))])
            if row[0] in mindistance:
                if dist < mindistance[row[0]]:
                    mindistance[row[0]] = dist
            else:
                mindistance[row[0]] = dist
        return self.normalize_scores(mindistance, smallIsBetter=1)

    def inbound_link_score(self, rows):
        unique_urls = {row[0]: 1 for row in rows}
        inbound_count = {}
        for url_id in unique_urls:
            self.cur.execute('select count(*) from link where to_id = %s', (url_id,))
            inbound_count[url_id] = self.cur.fetchone()[0]
        return self.normalize_scores(inbound_count)

    # Iteratively compute the PageRank of every indexed page.
    def calculate_page_rank(self, iterations=20):
        START_PR = 1
        self.cur.execute('update url_list set page_rank = %s', (START_PR,))
        self.dbcommit()
        for i in range(iterations):
            print "Iteration %d" % i
            self.cur.execute('select id from url_list')
            for url_id, in self.cur.fetchall():
                # The final PR is the damped 0.15 base plus contributions from inbound links
                pr = 0.15
                self.cur.execute('select distinct from_id from link where to_id = %s', (url_id,))
                # Loop over every page that links to this one
                for linker, in self.cur.fetchall():
                    # PageRank of the linking page
                    self.cur.execute('select page_rank from url_list where id = %s', (linker,))
                    linking_pr = self.cur.fetchone()[0]
                    # Total number of links on the linking page
                    self.cur.execute('select count(*) from link where from_id = %s', (linker,))
                    linking_count = self.cur.fetchone()[0]
                    pr += 0.85 * (float(linking_pr) / linking_count)
                self.cur.execute('update url_list set page_rank = %s where id = %s', (pr, url_id))
            self.dbcommit()

    def page_rank_score(self, rows):
        page_ranks = {row[0]: 0 for row in rows}
        for row in rows:
            self.cur.execute('select page_rank from url_list where id = %s', (row[0],))
            page_ranks[row[0]] = self.cur.fetchone()[0]
        return self.normalize_scores(page_ranks)

    # Score pages by the PageRank of the pages whose anchor text
    # pointing at them contains the query words.
    def link_text_score(self, rows, word_ids):
        link_scores = {row[0]: 0 for row in rows}
        word_ids_string = ','.join(map(str, word_ids))
        url_ids_string = ','.join(set([str(row[0]) for row in rows]))
        self.cur.execute('''select sum(u_l.page_rank), l.to_id
                            from link l
                            join link_words l_w on l.id = l_w.link_id
                            join url_list u_l on u_l.id = l.from_id
                            where word_id IN (%s) and l.to_id IN (%s)
                            group by l.to_id''' % (word_ids_string, url_ids_string))
        for sum_pr_url_from, url_id in self.cur.fetchall():
            link_scores[url_id] = sum_pr_url_from
        return self.normalize_scores(link_scores)

    # Insert or update a connection strength in the given weights table.
    def set_strength(self, from_id, to_id, table, strength):
        self.cur.execute('select id from %s where from_id = %%s and to_id = %%s' % table, (from_id, to_id))
        res = self.cur.fetchone()
        if res is None:
            self.cur.execute('insert into %s (from_id, to_id, strength) values (%%s, %%s, %%s)' % table, (from_id, to_id, strength))
        else:
            row_id = res[0]
            self.cur.execute('update %s set strength = %%s where id = %%s' % table, (strength, row_id))
        self.dbcommit()
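
    # setup_network below calls a get_strength that is missing from this paste.
    # A minimal sketch, modeled on set_strength above; the -0.2 / 0 defaults for
    # unseen word->hidden / hidden->url connections are an assumption.
    def get_strength(self, from_id, to_id, layer):
        table = 'word_hidden' if layer == 0 else 'hidden_url'
        self.cur.execute('select strength from %s where from_id = %%s and to_id = %%s' % table, (from_id, to_id))
        res = self.cur.fetchone()
        if res is None:
            return -0.2 if layer == 0 else 0  # assumed defaults
        return res[0]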

    # Create a hidden node for this combination of query words, if needed.
    # For simplicity, only short queries (at most three words) get one.
    def generate_hidden_node(self, word_ids, urls):
        if len(word_ids) > 3:
            return None
        # Check whether we already created a node for this set of words
        sorted_words = map(str, word_ids)
        sorted_words.sort()
        create_key = '_'.join(sorted_words)
        self.cur.execute("select count(id) from hidden_node where create_key = %s", (create_key,))
        count_hidden_id = self.cur.fetchone()
        # If not, create it now
        if not count_hidden_id[0]:
            self.cur.execute("insert into hidden_node (create_key) values (%s) returning hidden_node.id", (create_key,))
            hidden_id = self.cur.fetchone()[0]
            # Set the default weights
            for word_id in word_ids:
                self.set_strength(word_id, hidden_id, 'word_hidden', 1.0 / len(word_ids))
            for url_id in urls:
                self.set_strength(hidden_id, url_id, 'hidden_url', 0.1)
            self.dbcommit()

    # Collect every hidden-node id relevant to these words and URLs.
    def get_all_hidden_ids(self, word_ids, url_ids):
        l1 = {}
        for word_id in word_ids:
            self.cur.execute('select to_id from word_hidden where from_id = %s', (word_id,))
            for row in self.cur.fetchall():
                l1[row[0]] = 1
        for url_id in url_ids:
            self.cur.execute('select from_id from hidden_url where to_id = %s', (url_id,))
            for row in self.cur.fetchall():
                l1[row[0]] = 1
        return l1.keys()

    def setup_network(self, word_ids, url_ids):
        # node id lists
        self.word_ids = word_ids
        self.hidden_ids = self.get_all_hidden_ids(word_ids, url_ids)
        self.url_ids = url_ids
        # node output signals
        self.hidden_word_output = [1.0] * len(self.word_ids)
        self.hidden_layer_output = [1.0] * len(self.hidden_ids)
        self.hidden_url_output = [1.0] * len(self.url_ids)
        # weight matrices
        self.word_layer_weights = [
            [self.get_strength(word_id, hidden_id, 0) for hidden_id in self.hidden_ids] for word_id in self.word_ids
        ]
        self.layer_url_weights = [
            [self.get_strength(hidden_id, url_id, 1) for url_id in self.url_ids] for hidden_id in self.hidden_ids
        ]

    def feed_forward(self):
        # the only inputs are the query words
        for i in xrange(len(self.word_ids)):
            self.hidden_word_output[i] = 1.0
        # activations of the hidden nodes
        for j in xrange(len(self.hidden_ids)):
            total = 0.0
            for i in xrange(len(self.word_ids)):
                total += self.hidden_word_output[i] * self.word_layer_weights[i][j]
            self.hidden_layer_output[j] = tanh(total)
        # activations of the output nodes
        for k in xrange(len(self.url_ids)):
            total = 0.0
            for j in xrange(len(self.hidden_ids)):
                total += self.hidden_layer_output[j] * self.layer_url_weights[j][k]
            self.hidden_url_output[k] = tanh(total)
        return self.hidden_url_output[:]

    def get_result(self, word_ids, url_ids):
        self.setup_network(word_ids, url_ids)
        return self.feed_forward()

    def back_propagate(self, targets, N=0.5):
        # compute the deltas for the output layer
        output_deltas = [0.0] * len(self.url_ids)
        for k in xrange(len(self.url_ids)):
            error = targets[k] - self.hidden_url_output[k]
            output_deltas[k] = dtanh(self.hidden_url_output[k]) * error
        # compute the deltas for the hidden layer
        hidden_deltas = [0.0] * len(self.hidden_ids)
        for j in xrange(len(self.hidden_ids)):
            error = 0.0
            for k in xrange(len(self.url_ids)):
                error += output_deltas[k] * self.layer_url_weights[j][k]
            hidden_deltas[j] = dtanh(self.hidden_layer_output[j]) * error
        # update the weights between the hidden and output layers
        for j in xrange(len(self.hidden_ids)):
            for k in xrange(len(self.url_ids)):
                change = output_deltas[k] * self.hidden_layer_output[j]
                self.layer_url_weights[j][k] += N * change
        # update the weights between the input and hidden layers
        for i in xrange(len(self.word_ids)):
            for j in xrange(len(self.hidden_ids)):
                change = hidden_deltas[j] * self.hidden_word_output[i]
                self.word_layer_weights[i][j] += N * change

    # Generate the hidden node if necessary, run the network forward,
    # then back-propagate against the URL the user actually selected.
    def train_query(self, word_ids, url_ids, selected_url_id):
        self.generate_hidden_node(word_ids, url_ids)
        self.setup_network(word_ids, url_ids)
        self.feed_forward()
        targets = [0.0] * len(url_ids)
        targets[url_ids.index(selected_url_id)] = 1.0
        self.back_propagate(targets)
        self.update_database()

    # Persist the in-memory weight matrices back to the database.
    def update_database(self):
        for i in xrange(len(self.word_ids)):
            for j in xrange(len(self.hidden_ids)):
                self.set_strength(self.word_ids[i], self.hidden_ids[j], 'word_hidden', self.word_layer_weights[i][j])
        for j in xrange(len(self.hidden_ids)):
            for k in xrange(len(self.url_ids)):
                self.set_strength(self.hidden_ids[j], self.url_ids[k], 'hidden_url', self.layer_url_weights[j][k])

    # Score URLs with the neural network; its methods live on this same class,
    # so get_result is called directly.
    def neiron_net_score(self, rows, word_ids):
        # unique URL ids as an ordered list
        url_ids = list(set([row[0] for row in rows]))
        nn_res = self.get_result(word_ids, url_ids)
        scores = {url_ids[i]: nn_res[i] for i in xrange(len(url_ids))}
        return self.normalize_scores(scores)
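
# A minimal usage sketch (illustrative, not part of the original paste): crawl
# from a seed page, compute PageRank, then run a query. Assumes the schema
# above exists and the credentials in __init__ are valid; substitute query
# words that actually occur in the crawled articles (longer than three chars).
if __name__ == '__main__':
    indexer = Indexer()
    indexer.crawl(['http://news.rambler.ru/'], depth=2)
    indexer.calculate_page_rank()
    indexer.search(u'example query')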