-
Notifications
You must be signed in to change notification settings - Fork 0
/
analyse_query.py
113 lines (94 loc) · 3.95 KB
/
analyse_query.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import spacy
import joblib
import pickle
import nltk
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from vectorizer_pipeline import tokenize
# Fetch the "popular" NLTK data bundle (stopwords, tokenizers, ...) once at
# import time; quiet=True suppresses the download progress output.
nltk.download('popular', quiet=True)
# Lightweight spaCy pipeline: parser and NER are disabled — presumably only
# tokenization/tagging is needed downstream (TODO confirm against callers).
spacy_nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# Directory containing this file; all data artefacts are resolved relative to it.
currpath = Path(__file__).parent
# Default number of search results returned by analyse_query.
RESULT_LIMIT = 10
# Number of candidate query-expansion terms taken from the Rocchio vector.
QUERY_EXPN_LIMIT = 20
# Precomputed artefacts produced by the indexing pipeline:
# fitted TF-IDF vectorizer used to transform incoming queries.
vectorizer = joblib.load(currpath / 'DataFiles/vectorizer.joblib')
# TF-IDF matrix of the whole document corpus (one row per document).
tfidfs = joblib.load(currpath / 'DataFiles/tfidf.joblib')
# DataFrame with at least 'Link' and 'Doc' columns, row-aligned with tfidfs.
df = pd.read_pickle(currpath / 'DataFiles/dataFrame_bk.pkl')
# Mapping from document link -> precomputed PageRank score.
# NOTE(review): pickle.load on a local trusted artefact; do not point this at
# untrusted input.
page_rank = pickle.load(open(currpath / 'DataFiles/page_rank.pkl', 'rb'))
def analyse_query(query, n=10, page_rank_flag=False):
    """Rank corpus documents against *query* and suggest expansion terms.

    Documents are scored by TF-IDF cosine similarity to the query; when
    ``page_rank_flag`` is True the precomputed PageRank score is added to
    the cosine score for ranking. Expansion terms come from Rocchio-style
    relevance feedback over the top cosine matches.

    Parameters
    ----------
    query : str
        Raw search query text.
    n : int, optional
        Number of results to return (default 10).
    page_rank_flag : bool, optional
        Rank by cosine similarity + PageRank instead of cosine alone.

    Returns
    -------
    tuple
        ``(links, documents, cosine_similarities, page_ranks,
        expansion_terms)`` for the top *n* documents.
    """
    result_limit = int(n)  # local; avoids shadowing module RESULT_LIMIT
    print(query)
    q_tfidf = vectorizer.transform([query])
    print(q_tfidf.shape, tfidfs.shape)

    # Per-document record: link, raw text, cosine similarity to the query
    # vector, and the precomputed PageRank for the link.
    dict_cossim = {}
    for i in range(len(df)):
        dict_cossim[i] = {
            'Link': df.loc[i]['Link'],
            'Content': df.loc[i]['Doc'],
            'CosSim': cosine_similarity(tfidfs[i], q_tfidf),
            'PageRank': page_rank[df.loc[i]['Link']],
        }

    # Two rankings: pure cosine similarity, and cosine + PageRank.
    top_n = sorted(dict_cossim.keys(),
                   key=lambda x: dict_cossim[x]['CosSim'][0],
                   reverse=True)[:result_limit]
    top_n_page_rank = sorted(dict_cossim.keys(),
                             key=lambda x: (dict_cossim[x]['CosSim'][0]
                                            + dict_cossim[x]['PageRank']),
                             reverse=True)[:result_limit]

    # Collect result columns for whichever ranking the caller asked for
    # (the two branches previously duplicated this loop verbatim).
    if page_rank_flag:
        print("Page Rank")
        chosen = top_n_page_rank
    else:
        print("Cosssim")
        chosen = top_n
    link_list, doc_list, cossim_list, pagerank_list = [], [], [], []
    for i in chosen:
        link_list.append(dict_cossim[i]['Link'])
        doc_list.append(dict_cossim[i]['Content'])
        cossim_list.append(dict_cossim[i]['CosSim'])
        pagerank_list.append(dict_cossim[i]['PageRank'])

    # Rocchio relevance feedback: the top cosine matches are the relevant
    # set, everything else is non-relevant. alpha is implicitly 1 on the
    # original query vector.
    relevant_docs_sum = np.sum(tfidfs[top_n], axis=0)
    irrelevant = np.subtract(np.sum(tfidfs, axis=0), relevant_docs_sum)
    beta, gamma = 0.75, 0.15
    # Set sizes follow the actual result limit (was hard-coded to 10 even
    # when the caller asked for a different n).
    nr, d_nr = result_limit, tfidfs.shape[0] - result_limit
    query_m = q_tfidf + (beta * relevant_docs_sum / nr) - \
        (gamma * irrelevant / d_nr)

    # Take the QUERY_EXPN_LIMIT highest-weighted terms of the modified
    # query vector (was a hard-coded 20), highest weight first.
    yy = np.asarray(query_m).flatten()
    indices = np.argpartition(yy, -QUERY_EXPN_LIMIT)[-QUERY_EXPN_LIMIT:]
    sortedindices = indices[np.argsort(yy[indices])][::-1]
    # NOTE(review): get_feature_names() was removed in scikit-learn 1.2;
    # newer versions need get_feature_names_out(). Kept as-is to match the
    # pinned environment — confirm before upgrading sklearn.
    features = vectorizer.get_feature_names()
    queries_terms = [features[i]
                     for i in sortedindices if features[i] not in query]

    def filter_query_terms(query_list):
        """Drop any term that is a substring of another suggested term,
        e.g. drop "list" when "list index" is also suggested."""
        result = []
        for q in query_list:
            if not any(q in other for other in query_list if q != other):
                result.append(q)
        return result

    res_filtered = filter_query_terms(queries_terms)
    # (A variant that prefixed the original query onto each expansion term
    # was removed: it was never returned, and the original note said it
    # skewed results.)
    print(link_list)
    print(queries_terms)
    return (link_list, doc_list, cossim_list, pagerank_list, res_filtered)