# Code:
import numpy as np
import itertools
import requests
import re
import io
from cleantext import clean
from lxml import html
from razdel import sentenize
from sklearn.neighbors import NearestNeighbors
from sentence_transformers import SentenceTransformer
# Sentence encoder used for both the indexed posts and the search query
# (NLI-tuned BERT with mean pooling from sentence-transformers).
embedder = SentenceTransformer('bert-base-nli-mean-tokens')
# Archive pages of a single owen.ru forum thread to scrape.
# Order matters: indexer() treats the last post of the last page specially.
pages = [
'https://owen.ru/forum/archive/index.php/t-26216.html',
'https://owen.ru/forum/archive/index.php/t-26216-p-1.html',
'https://owen.ru/forum/archive/index.php/t-26216-p-2.html',
'https://owen.ru/forum/archive/index.php/t-26216-p-3.html',
'https://owen.ru/forum/archive/index.php/t-26216-p-4.html',
'https://owen.ru/forum/archive/index.php/t-26216-p-5.html',
'https://owen.ru/forum/archive/index.php/t-26216-p-6.html',
'https://owen.ru/forum/archive/index.php/t-26216-p-7.html'
]
def word_tokenize(text):
    """Normalize forum text for embedding.

    Lowercases, folds 'ё' into 'е', pads sentence punctuation and brackets
    with spaces, strips parentheses and dots/commas/colons/hyphens, removes
    the forum smiley artifact 'confused', and collapses runs of spaces.

    Returns the normalized string wrapped in single leading/trailing spaces.
    """
    text = text.lower()
    regexes = [
        (re.compile(u'ё'), 'е'),                         # common RU normalization
        (re.compile(r'([?!\.])'), r' \1 '),              # pad sentence punctuation
        (re.compile(r'([\]\[\(\)\{\}\<\>])'), r' \1 '),  # pad brackets
        (re.compile(r'([\(\)])'), ''),                   # then drop parentheses
        (re.compile(r'([\.,:-])'), ''),                  # drop . , : -
        # Must run BEFORE whitespace collapsing: the original order combined
        # with the buggy pattern below meant this could never match.
        (re.compile(u'confused'), ''),                   # forum smiley artifact
        # BUG FIX: was ' *', which matches the empty string at every position
        # and (Python 3.7+) inserted a space between every character.
        (re.compile(u' +'), ' '),                        # collapse space runs
    ]
    text = " " + text + " "
    for regexp, substitution in regexes:
        text = regexp.sub(substitution, text)
    return text
def indexer():
    """Scrape every thread page and build the searchable corpus.

    Returns a 5-tuple:
        names   -- poster usernames (one per indexed post)
        dates   -- post dates (parallel to names)
        texts   -- normalized post texts (word_tokenize output)
        vectors -- embeddings of texts, as lists of floats
        query   -- normalized text of the last post on the last page,
                   or None if that post is missing or has no text
                   (previously left unassigned -> NameError on return)
    """
    texts = []
    names = []
    dates = []
    query = None
    for url in pages:
        page = requests.get(url)
        tree = html.fromstring(page.content)
        posts = tree.xpath('//div[@class="post"]')
        for unit in posts:
            # Evaluate the xpath once (the original ran it twice for the
            # last post) and skip posts with no direct text node, which
            # previously raised IndexError on post[0].
            post = unit.xpath('.//div[@class="posttext"]/text()')
            if not post:
                continue
            tokenized = word_tokenize(post[0])
            # The last post of the last page doubles as the default query.
            if url == pages[-1] and unit is posts[-1]:
                query = tokenized
            texts += [tokenized]
            names += unit.xpath('.//div[@class="username"]/text()')
            dates += unit.xpath('.//div[@class="date"]/text()')
    return names, dates, texts, [vector.astype(float).tolist() for vector in embedder.encode(texts)], query
def search(engine, text, sentences, names, dates):
    """Embed *text*, find its single nearest indexed post, and print it.

    Prints a cosine-based relevance score together with the matched post's
    text, username and date; when a following post exists in the index, it
    is printed as well.
    """
    query_vector = embedder.encode([text])[0].astype(float).reshape(1, -1)
    distances, neighbors = engine.kneighbors(query_vector, return_distance=True)
    distance = distances[0][0]
    position = neighbors[0][0]
    print('Релевантность "%.3f' % (1 - distance / 2),
          'Фраза: %s", Ник "%s", Дата "%s"' % (sentences[position], names[position], dates[position]))
    if position < (len(names) - 1):
        nxt = position + 1
        print('Последующий пост: %s", Ник "%s", Дата "%s"' % (sentences[nxt], names[nxt], dates[nxt]))
# Build the corpus (scrapes the live forum) and fit a 1-nearest-neighbor
# engine over the post embeddings using cosine distance.
names, dates, sentences, vectors, query = indexer()
engine = NearestNeighbors(n_neighbors=1, metric='cosine').fit(np.array(vectors).reshape(len(vectors), -1))
# NOTE(review): the query returned by indexer() above is discarded here —
# this hard-coded question overwrites it. Confirm which one is intended.
query = 'посоветуйте как лучше сделать схему на форуме не нашел'
search(engine, query, sentences, names, dates)