# Source code for core.lda_engine

import json
import pickle

import numpy as np
from flask import current_app
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from textblob import TextBlob

from app import app
from app.utils.environment import load_env


class LdaModelWrapper:
    """Wrapper around a trained Gensim LDA model.

    Bundles the model itself, its token dictionary, a per-author topic
    library, and an optional pre-rendered HTML visualization, all loaded
    from the ``trained/`` directory.
    """

    def __init__(self, filename, force_load=False, np=True, keep_state=True):
        """Initializes a Gensim LDA model.

        :param filename: The base file name of the model.
        :param force_load: Force the LDA model to be loaded in the memory.
        :param np: Determine if the model is trained with noun phrases or
            individual tokens. True for noun phrases, False for individual
            tokens. Default: True
        :param keep_state: Keep the state in the memory. Default: True.
        """
        # NOTE(review): the parameter name ``np`` shadows the module-level
        # numpy import inside this method; kept for backward compatibility
        # with keyword callers.

        def load_author_lib():
            # Prefer the JSON author library; fall back to a legacy pickle
            # when the JSON file is missing.
            try:
                with open('trained/' + filename + ".json", "rb") as fh:
                    return json.load(fh)
            except IOError:
                with open('trained/' + filename + ".pkl", "rb") as fh:
                    return pickle.load(fh)

        self.filename = filename
        self.use_noun_phrases = np  # TODO: let user define if a model is trained with noun phrases
        with app.app_context():
            if not current_app.config["LAZYLOAD_LDA"] or force_load:
                self.model = LdaModel.load('trained/' + filename)
                if not keep_state:
                    self.model.state = None  # Dispose internal state to save memory
                self.num_topics = self.model.num_topics
                self.num_terms = self.model.num_terms
                self.authors_lib = load_author_lib()
                self.dictionary = Dictionary.load('trained/' + filename + ".dictionary")
                try:
                    # TODO: maybe implement a visualization by pyLDAvis
                    with open('trained/' + filename + ".html") as fh:
                        self.html = fh.read()
                except IOError:
                    self.html = None
                print("LDA model loaded: " + filename + ", " + str(self.num_topics) + " topics.")
            else:
                print("Skipped LDA model preload: " + filename)
[docs] def tokenize(self, text): """Turns a pure text to a bag of words using the dictionary of a trained LDA model. :param text: Raw text string. :return: A bag of words. """ if self.use_noun_phrases: tokenized = TextBlob(text.lower()).noun_phrases else: tokenized = TextBlob(text.lower()).words print(tokenized) return self.dictionary.doc2bow(tokenized)
[docs] def predict(self, text): """Predicts topics from a raw text string. :param text: Raw text string. :return: a NumPy array of topics IDs and their confidence levels. """ if not models: self.__init__(self.filename, force_load=True) vec = self.tokenize(text) print("BoW:") print(vec) topics = np.array(self.model[vec], dtype=[('topic_id', int), ('confidence', float)]) topics[::-1].sort(order="confidence") # This may seem super weird, but it works and it is actually more efficient # see https://stackoverflow.com/questions/26984414/efficiently-sorting-a-numpy-array-in-descending-order print(topics) return topics
[docs] def get_author_top_topics(self, author_id, top=10): """Generates the top N relevant topics of an author in our database. :param author_id: the author's ID in our database. :param top: Number of topics to be returned. :return: a NumPy array of topics IDs and their confidence levels. """ try: author = self.authors_lib[author_id] except KeyError: author = self.authors_lib[str(author_id)] top_topics = [] for topic_id, confidence in enumerate(author): if confidence > 1: top_topics.append([topic_id, confidence - 1]) top_topics.sort(key=lambda tup: tup[1], reverse=True) return top_topics[:top]
[docs] def get_topic_in_list(self, topic_id): """Given a topic ID in the model, generates a list of terms. :param topic_id: The topic's ID in the model. :return: A list of terms. """ return [term.strip().split('*') for term in self.model.print_topic(topic_id).split("+")]
[docs] def get_topic_in_string(self, topic_id, top=5): """Given a topic ID in the model, generates a string representation of that topic. :param topic_id: The topic's ID in the model. :param top: Top N relevant terms. :return: A string representation of the topic. """ topic_list = self.get_topic_in_list(topic_id) topic_string = " / ".join([i[1] for i in topic_list][:top]) return topic_string
[docs] def get_topics_in_string(self, topics, confidence=False): """Converts a list of topics (with or without confidence levels) to a list of strings encoded in a dict. :param topics: The list of topics to be converted. :param confidence: If the input topics contains confidence levels, make sure this is set to True. :return: a list of dictionary that includes string representations (or with confidence levels) """ if confidence: topics_list = [] for topic in topics: topic_map = { "topic_id": topic[0], "string": self.get_topic_in_string(topic[0]), "confidence": topic[1] } topics_list.append(topic_map) else: topics_list = [] for topic_id in topics: topic_map = { "topic_id": topic_id, "string": self.get_topic_in_string(topic_id), } topics_list.append(topic_map) return topics_list
# Build the module-level model registry: one wrapper per entry in the env
# file (each wrapper decides for itself whether to preload or lazy-load).
model_files = load_env("lda_models.env")
models = {name: LdaModelWrapper(path) for name, path in model_files.items()}
print(models)