Source code for core.matching.lda

import numpy as np
from core.lda_engine import models

from core.helper.tables import get_database


def score(paper_vec, author_vec, method):
    """Score a paper-author match.

    :param paper_vec: the vector to be matched (array-like of numbers)
    :param author_vec: the vector to be scored against (usually the vector
        of an author); must have the same length as ``paper_vec``
    :param method: the name of the scoring implementation, one of
        ``"euclidean"``, ``"js"`` or ``"default"``
    :return: a scalar measuring the score of the match
    :raises KeyError: if ``method`` is not one of the names listed above
    """

    def euclidean_distance(p, q):
        """Return the inverse of the squared Euclidean distance of p and q.

        Identical vectors are the closest possible match, so a zero distance
        yields ``inf`` (the limit of ``1 / d**2``) rather than the lowest
        score.  Note that ``inf`` is not representable in strict JSON.
        """
        # Convert explicitly so that plain Python lists can be subtracted.
        diff = np.asarray(p, dtype=float) - np.asarray(q, dtype=float)
        sum_of_squares = float(np.sum(diff ** 2))
        if sum_of_squares == 0:
            return float("inf")
        return 1 / sum_of_squares

    def js(p, q):
        """Compute the Jensen-Shannon divergence of two distributions.

        :param p: array-like probability distribution that sums to 1.
        :param q: same as p, with the same length.
        :return: the Jensen-Shannon divergence (base-2 logarithm)

        Note: from
        https://stackoverflow.com/questions/15880133/jensen-shannon-divergence

        NOTE(review): this is a divergence -- 0 for identical distributions
        and growing as they differ -- so a *lower* value means a closer
        match.  Callers ranking by descending score should account for it.
        """

        def _kldiv(a, b):
            # Zero entries of ``a`` produce 0 * log2(0) = NaN; those terms
            # contribute nothing to the divergence, so they are skipped.
            with np.errstate(divide="ignore", invalid="ignore"):
                terms = a * np.log2(a / b)
            return np.nansum(terms)

        p = np.asarray(p, dtype=float)
        q = np.asarray(q, dtype=float)
        mixture = 0.5 * (p + q)
        return 0.5 * (_kldiv(p, mixture) + _kldiv(q, mixture))

    def default(p, q):
        """Take the dot product of the two vectors, shifted down by one.

        :param p: vector of the paper
        :param q: vector of a scholar in the author's lib.
        :return: the score
        """
        return np.dot(p, q) - 1

    methods = {"euclidean": euclidean_distance, "js": js, "default": default}
    return methods[method](paper_vec, author_vec)


def match_by_lda(text, model_name, top=50, detailed=True, scoring_impl="default", base=0):
    """Rank the authors of an LDA model against a string of raw text.

    :param text: The text to be matched.
    :param model_name: The name of the LDA model to be used (key into
        ``models``).
    :param top: the maximum number of results to be returned.
    :param detailed: when True the ranked pairs are expanded through
        ``detailed_results``. It should always be True unless it is used
        outside the web app.
    :param scoring_impl: the scoring implementation name handed to ``score``.
    :param base: The initial value of every component of the paper vector.
    :return: a ``(results, matched_topics)`` tuple. ``results`` is the output
        of ``detailed_results`` if detailed=True, otherwise a list of
        ``[author_id, score]`` pairs sorted by descending score.
        ``matched_topics`` is a list of ``(topic string, weight)`` tuples for
        the topics whose weight exceeds ``base``, sorted by descending weight.
    """
    print("Matching by LDA")

    lda_model = models[model_name]
    predicted = lda_model.predict(text)

    # Accumulate into a list copy of the base-filled array rather than the
    # array itself -- presumably because an integer array (base=0) would
    # truncate fractional confidences; TODO confirm.
    paper_vec = list(np.array([base] * lda_model.num_topics))
    for topic_id, confidence in zip(predicted['topic_id'], predicted['confidence']):
        paper_vec[topic_id] += confidence

    # One [author_id, score] pair per author known to the model.
    results = [
        [author_id, score(lda_model.authors_lib[author_id], paper_vec, scoring_impl)]
        for author_id in lda_model.authors_lib
    ]
    results.sort(key=lambda pair: pair[1], reverse=True)

    print("Base:", base)
    print("paper_vec: ")
    print(paper_vec)

    # Only topics that received some weight on top of the base value count.
    matched_topics = sorted(
        (
            (lda_model.get_topic_in_string(index), weight)
            for index, weight in enumerate(paper_vec)
            if weight > base
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )
    print("matched_topics:")
    print(matched_topics)

    best = results[:top]
    if not detailed:
        return best, matched_topics
    return detailed_results(best, model_name), matched_topics


def detailed_results(results, model_name):
    """Retrieve the details of the matched authors (aka reviewers).

    :param results: a sequence of ``[author_id, score]`` pairs, such as the
        ranked list built by ``match_by_lda``
    :param model_name: the name of the current model, passed to
        ``get_database`` to load its tables
    :return: a dictionary keyed by integer author id. Each value is a dict
        with the keys ``'score'``, ``'profile'``, ``'keywords'`` and
        ``'documents'``. You'll see an example output when you run on the
        demo model.
    """
    author_, documents_, k_a, d_a = get_database(model_name)
    authors = {}
    for result in results:
        author_id = int(result[0])
        match_score = result[1]

        # ``DataFrame.ix`` was removed in pandas 1.0; ``.loc`` is the
        # label-based replacement.
        # NOTE(review): assumes both tables carry an integer index holding
        # the ids -- confirm against get_database.
        profile = author_.loc[author_id].to_dict()
        profile['id'] = int(profile['id'])

        keywords = list(k_a[k_a.authors_id == author_id].keyword.values)

        documents_id = d_a[d_a.authors_id == author_id].documents_id.values
        documents = []
        for document_id in documents_id:
            document = documents_.loc[document_id].to_dict()
            document['id'] = int(document['id'])
            try:  # Optional data fields
                document['source_id'] = int(document['source_id'])
            except (KeyError, TypeError, ValueError):
                # The field may be absent (KeyError) or present but empty:
                # int(None) raises TypeError and int(NaN) raises ValueError.
                pass
            documents.append(document)

        authors[author_id] = {
            'score': match_score,
            'profile': profile,
            'keywords': keywords,
            'documents': documents,
        }
        if len(authors) == 1:  # For demo
            print("Demo: authors dictionary sample.")
            print(authors)
    # Note that unlike dict in python, json does not allow integer keys
    return authors