# Source code for core.matching.lda

import numpy as np
from core.lda_engine import models

from core.helper.tables import get_database


def score(paper_vec, author_vec, method):
    """
    Scores a paper-author match.

    Both vectors are converted to float NumPy arrays before scoring, so plain
    lists are accepted as well as arrays.

    :param paper_vec: the vector to be matched

    :param author_vec: the vector to be scored against (usually it is the vector of an author)

    :param method: the name of scoring implementation: "euclidean", "js" or "default"

    :return: a scalar measuring the score of the match

    :raises KeyError: if ``method`` is not one of the names above
    """
    def euclidean_distance(p, q):
        """Returns the inverse of the squared Euclidean distance between p and q.

        :param p: 1-D array

        :param q: 1-D array of the same length as p

        :return: 1 / sum((p - q) ** 2), or 0 when the two vectors are identical
        """
        diff = p - q
        sum_of_squares = np.dot(diff, diff)
        if sum_of_squares == 0:
            # Guard against division by zero.
            # NOTE(review): this gives an exact match the lowest possible
            # score of this metric -- confirm that is intended.
            return 0
        return 1 / sum_of_squares

    def js(p, q):
        """Computes the Jensen-Shannon divergence between two probability distributions.

        :param p: array of a probability distribution that sums to 1.

        :param q: same as p, of equal length.

        :return: the Jensen-Shannon divergence

        Note: from https://stackoverflow.com/questions/15880133/jensen-shannon-divergence

        NOTE(review): a divergence is smaller for more similar inputs, so a
        caller ranking by descending score puts the most divergent vectors
        first -- confirm this is intended.
        """
        def _kldiv(a, b):
            # Entries where a == 0 evaluate to 0 * log2(0) = NaN; they are
            # skipped, and the warnings they would trigger are silenced.
            with np.errstate(divide="ignore", invalid="ignore"):
                terms = a * np.log2(a / b)
            return np.nansum(terms)

        mixture = 0.5 * (p + q)

        return 0.5 * (_kldiv(p, mixture) + _kldiv(q, mixture))

    def default(p, q):
        """Takes the dot product of the paper's topic vector and the author's vector, minus 1.

        :param p: vector of the paper

        :param q: vector of a scholar in the author's lib.

        :return: the score
        """
        return np.dot(p, q) - 1

    methods = {"euclidean": euclidean_distance,
               "js": js,
               "default": default}

    # Look the implementation up first so an unknown name fails with KeyError
    # before any conversion work is done.
    scorer = methods[method]

    # Plain lists do not support element-wise arithmetic such as ``p - q``;
    # convert once here so every implementation receives arrays.
    p = np.asarray(paper_vec, dtype=float)
    q = np.asarray(author_vec, dtype=float)

    return scorer(p, q)


def match_by_lda(text, model_name, top=50, detailed=True, scoring_impl="default", base=0):
    """Gives the best matching result given a string of raw text.

    The text is turned into a topic vector by the named model, every author in
    the model's ``authors_lib`` is scored against that vector, and the authors
    are ranked by descending score.

    :param text: The text to be matched.

    :param model_name: The name of the LDA model to be used.

    :param top: the maximum number of results to be returned.

    :param detailed: return a detailed result. It should always be True unless it is used outside the web app.

    :param scoring_impl: the scoring implementation to be used.

    :param base: The initial value of every entry of the paper's topic vector.

    :return: a tuple ``(matches, matched_topics)``. ``matches`` is the matched
             result in dictionary form if detailed=True, otherwise a list of
             ``[author_id, score]`` pairs. ``matched_topics`` is a list of
             ``(topic_string, value)`` tuples for the topics whose value
             exceeds ``base``, sorted by descending value.
    """
    print("Matching by LDA")

    model = models[model_name]
    paper_topics = model.predict(text)

    # Build the vector as floats: assigning a fractional confidence into an
    # integer array silently truncates it, which is why it previously had to
    # be copied into a list before being updated.
    paper_vec = np.full(model.num_topics, base, dtype=float)
    for topic_id, confidence in zip(paper_topics['topic_id'], paper_topics['confidence']):
        paper_vec[topic_id] += confidence

    results = []
    for author_id_ in model.authors_lib:
        # Arguments follow score()'s documented order: paper first, author second.
        results.append([author_id_, score(paper_vec, model.authors_lib[author_id_], scoring_impl)])
    results.sort(key=lambda tup: tup[1], reverse=True)

    print("Base:", base)
    print("paper_vec: ")
    print(paper_vec)

    # Only topics that received some confidence on top of the base value.
    matched_topics = [(model.get_topic_in_string(i), v) for i, v in enumerate(paper_vec) if v > base]
    matched_topics.sort(key=lambda tup: tup[1], reverse=True)
    print("matched_topics:")
    print(matched_topics)

    if detailed:
        return detailed_results(results[:top], model_name), matched_topics
    return results[:top], matched_topics


def detailed_results(results, model_name):
    """Retrieves matched author (aka reviewers) details.

    :param results: a sequence of ``[author_id, score]`` pairs, as produced by
                    match_by_lda before the details are attached

    :param model_name: the name of the current model

    :return: a dictionary keyed by integer author id. Each value is a dict with
             the keys ``'score'``, ``'profile'``, ``'keywords'`` and
             ``'documents'``. You'll see an example output when you run on the
             demo model.

    """

    author_, documents_, k_a, d_a = get_database(model_name)
    authors = {}
    for result in results:
        author_id = int(result[0])
        # Named match_score so the module-level score() function is not shadowed.
        match_score = result[1]
        # Label-based lookup; ``DataFrame.ix`` no longer exists in pandas >= 1.0.
        # NOTE(review): assumes the tables are indexed by integer id -- confirm
        # against get_database.
        profile = author_.loc[author_id].to_dict()
        profile['id'] = int(profile['id'])
        keywords = list(k_a[k_a.authors_id == author_id].keyword.values)
        documents_id = d_a[d_a.authors_id == author_id].documents_id.values
        documents = []
        for document_id in documents_id:
            document = documents_.loc[document_id].to_dict()
            document['id'] = int(document['id'])
            try:
                # Optional data fields
                document['source_id'] = int(document['source_id'])
            except KeyError:
                pass
            documents.append(document)
        author = {'score': match_score, 'profile': profile, 'keywords': keywords, 'documents': documents}
        authors[author_id] = author
        if len(authors) == 1:  # For demo
            print("Demo: authors dictionary sample.")
            print(authors)
        # Note that unlike dict in python, json does not allow integer keys
    return authors
# Source code for core.matching.lda

# Navigation

# Related Topics