Source code for core.matching.lda
import numpy as np
from core.lda_engine import models
from core.helper.tables import get_database
def score(paper_vec, author_vec, method):
    """
    Scores a paper-author match.

    :param paper_vec: the vector to be matched (any array-like)
    :param author_vec: the vector to be scored against (usually it is the vector of an author)
    :param method: the name of scoring implementation: "euclidean", "js" or "default"
    :return: a scalar measuring the score of the match
    :raises KeyError: if ``method`` is not one of the known implementations
    """

    def euclidean_distance(p, q):
        """Computes the inverse of the squared Euclidean distance.

        :param p: array-like vector
        :param q: array-like vector of the same length as p
        :return: 1 / sum((p - q) ** 2), or 0 when the two vectors are identical
        """
        # Convert first: plain Python lists do not support element-wise "-".
        diff = np.asarray(p) - np.asarray(q)
        sum_of_squares = np.sum(np.square(diff))
        if sum_of_squares == 0:
            # Identical vectors would divide by zero; they score 0 instead.
            # NOTE(review): 0 is also the lowest value this method can return,
            # so an exact match ranks last under a descending sort -- confirm
            # this is intended.
            return 0
        return 1 / sum_of_squares

    def js(p, q):
        """Computes the Jensen-Shannon divergence between two probability distributions.

        :param p: array-like probability distributions of equal length that sum to 1.
        :param q: same as P.
        :return: the Jensen-Shannon divergence
        Note: from https://stackoverflow.com/questions/15880133/jensen-shannon-divergence
        """

        def _kldiv(a, b):
            # A zero entry in ``a`` yields 0 * log2(0) = NaN; nansum counts
            # such terms as 0. errstate silences the warnings numpy would
            # otherwise emit for those entries.
            with np.errstate(divide="ignore", invalid="ignore"):
                return np.nansum(a * np.log2(a / b))

        p = np.asarray(p)
        q = np.asarray(q)
        midpoint = 0.5 * (p + q)
        return 0.5 * (_kldiv(p, midpoint) + _kldiv(q, midpoint))

    def default(p, q):
        """Takes the dot product of the paper's topic vector and the author's vector.

        :param p: vector of the paper
        :param q: vector of a scholar in the author's lib.
        :return: the dot product of p and q, minus 1
        """
        return np.dot(p, q) - 1

    methods = {
        "euclidean": euclidean_distance,
        "js": js,
        "default": default,
    }
    return methods[method](paper_vec, author_vec)
def match_by_lda(text, model_name, top=50, detailed=True, scoring_impl="default", base=0):
    """Gives the best matching result given a string of raw text.

    :param text: The text to be matched.
    :param model_name: The name of the LDA model to be used.
    :param top: the maximum number of results to be returned.
    :param detailed: return a detailed result. It should always be True unless it is used outside the web app.
    :param scoring_impl: the scoring implementation to be used.
    :param base: The initial value of every entry of the paper's topic vector.
    :return: a tuple ``(results, matched_topics)``. ``results`` is the output of
        ``detailed_results`` for the best ``top`` authors if detailed=True,
        otherwise a list of ``[author_id, score]`` pairs sorted by descending
        score. ``matched_topics`` is a list of ``(topic string, value)`` tuples
        for every topic whose value is above ``base``, sorted by descending value.
    """
    print("Matching by LDA")

    def update_author_vector(vec, doc_vec):
        """Returns a list copy of vec with each predicted topic's confidence added."""
        # Accumulate in a list so the caller's array is left untouched.
        # NOTE(review): with the default integer base the ndarray has an
        # integer dtype, so assigning into it directly would truncate
        # fractional confidences -- presumably why a copy was needed.
        vec = list(vec)
        for topic_id, confidence in zip(doc_vec['topic_id'], doc_vec['confidence']):
            vec[topic_id] = vec[topic_id] + confidence
        return vec

    model = models[model_name]
    paper_topics = model.predict(text)
    paper_vec = np.array([base] * model.num_topics)
    paper_vec = update_author_vector(paper_vec, paper_topics)

    # Arguments follow score()'s (paper_vec, author_vec, method) signature.
    results = [
        [author_id, score(paper_vec, model.authors_lib[author_id], scoring_impl)]
        for author_id in model.authors_lib
    ]
    results.sort(key=lambda pair: pair[1], reverse=True)

    print("Base:", base)
    print("paper_vec: ")
    print(paper_vec)

    # Only topics that received some confidence end up above the base value.
    matched_topics = [
        (model.get_topic_in_string(topic_id), value)
        for topic_id, value in enumerate(paper_vec)
        if value > base
    ]
    matched_topics.sort(key=lambda pair: pair[1], reverse=True)
    print("matched_topics:")
    print(matched_topics)

    if detailed:
        return detailed_results(results[:top], model_name), matched_topics
    return results[:top], matched_topics
def detailed_results(results, model_name):
    """Retrieves matched author (aka reviewers) details.

    :param results: a sequence of ``[author_id, score]`` pairs, such as the
        ranked list built by match_by_lda
    :param model_name: the name of the current model
    :return: a dictionary keyed by integer author id; each value is a dict with
        the keys 'score', 'profile', 'keywords' and 'documents'.
        You'll see an example output when you run on the demo model.
    """
    author_, documents_, k_a, d_a = get_database(model_name)
    authors = {}
    for result in results:
        author_id = int(result[0])
        # Named match_score so it does not shadow the module-level score().
        match_score = result[1]

        # DataFrame.ix was removed in pandas 1.0; .loc is its label-based
        # replacement.
        # NOTE(review): assumes the author and document frames have integer
        # indexes keyed by id, which is when .ix was label-based -- confirm
        # against get_database.
        profile = author_.loc[author_id].to_dict()
        profile['id'] = int(profile['id'])

        keywords = list(k_a[k_a.authors_id == author_id].keyword.values)

        documents = []
        for document_id in d_a[d_a.authors_id == author_id].documents_id.values:
            document = documents_.loc[document_id].to_dict()
            document['id'] = int(document['id'])
            try:
                # Optional data fields
                document['source_id'] = int(document['source_id'])
            except KeyError:
                pass
            documents.append(document)

        authors[author_id] = {
            'score': match_score,
            'profile': profile,
            'keywords': keywords,
            'documents': documents,
        }
        if len(authors) == 1:  # For demo
            print("Demo: authors dictionary sample.")
            print(authors)
    # Note that unlike dict in python, json does not allow integer keys
    return authors