# Source code for core.lda_engine
import json
import pickle
import numpy as np
from flask import current_app
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from textblob import TextBlob
from app import app
from app.utils.environment import load_env
class LdaModelWrapper:
    """Wrapper around a trained Gensim LDA model and its companion files.

    All artifacts are read from the ``trained/`` directory using ``filename``
    as the base name: the model itself, ``<filename>.dictionary``, an author
    library (``<filename>.json`` or ``<filename>.pkl``) and an optional
    ``<filename>.html`` file.
    """

    def __init__(self, filename, force_load=False, np=True, keep_state=True):
        """Initializes a Gensim LDA model.

        :param filename: The base file name of the model.
        :param force_load: Force the LDA model to be loaded in the memory,
            even when the ``LAZYLOAD_LDA`` app setting is enabled.
        :param np: Determine if the model is trained with noun phrases or individual tokens.
            True for noun phrases, False for individual tokens.
            Default: True.
            NOTE: inside this method the name shadows the module-level NumPy alias.
        :param keep_state: Keep the state in the memory.
            Default: True.
        """
        self.filename = filename
        self.use_noun_phrases = np  # TODO: let user define if a model is trained with noun phrases
        # Remembered so that a deferred (lazy) load honours the same setting.
        self.keep_state = keep_state
        # Stays None until the artifacts are loaded; see _ensure_loaded().
        self.model = None
        with app.app_context():
            preload = not current_app.config["LAZYLOAD_LDA"] or force_load
        if preload:
            self._load()
        else:
            print("Skipped LDA model preload: " + filename)

    def _load_author_lib(self):
        """Reads the author library, preferring the JSON file over the pickle file."""
        base_path = 'trained/' + self.filename
        try:
            with open(base_path + ".json", "rb") as json_file:
                return json.load(json_file)
        except IOError:
            # NOTE(security): pickle can execute arbitrary code while loading;
            # only use this fallback with trusted, locally produced files.
            with open(base_path + ".pkl", "rb") as pickle_file:
                return pickle.load(pickle_file)

    def _load(self):
        """Loads the model, author library, dictionary and optional HTML from disk."""
        base_path = 'trained/' + self.filename
        model = LdaModel.load(base_path)
        if not self.keep_state:
            model.state = None  # Dispose internal state to save memory
        authors_lib = self._load_author_lib()
        dictionary = Dictionary.load(base_path + ".dictionary")
        try:
            with open(base_path + ".html") as html_file:
                html = html_file.read()
            # TODO: maybe implement a visualization by pyLDAvis
        except IOError:
            html = None
        # Publish everything only after every step succeeded, so a failed
        # load never leaves the wrapper looking loaded but half-initialized.
        self.num_topics = model.num_topics
        self.num_terms = model.num_terms
        self.authors_lib = authors_lib
        self.dictionary = dictionary
        self.html = html
        self.model = model
        print("LDA model loaded: " + self.filename + ", " + str(self.num_topics) + " topics.")

    def _ensure_loaded(self):
        """Loads the artifacts on first use when the preload was skipped."""
        if getattr(self, "model", None) is None:
            self._load()

    def tokenize(self, text):
        """Turns a pure text to a bag of words using the dictionary of a trained LDA model.

        :param text: Raw text string.
        :return: A bag of words.
        """
        self._ensure_loaded()
        blob = TextBlob(text.lower())
        if self.use_noun_phrases:
            tokenized = blob.noun_phrases
        else:
            tokenized = blob.words
        print(tokenized)
        return self.dictionary.doc2bow(tokenized)

    def predict(self, text):
        """Predicts topics from a raw text string.

        :param text: Raw text string.
        :return: a NumPy structured array with ``topic_id`` and ``confidence``
            fields, sorted by descending confidence.
        """
        # Check this instance's own state (not the module-level registry) and
        # load in place so the noun-phrase / keep-state settings are kept.
        self._ensure_loaded()
        vec = self.tokenize(text)
        print("BoW:")
        print(vec)
        topics = np.array(self.model[vec], dtype=[('topic_id', int), ('confidence', float)])
        # Sorting the reversed view ascending leaves the array itself in
        # descending order without an extra copy.
        # see https://stackoverflow.com/questions/26984414/efficiently-sorting-a-numpy-array-in-descending-order
        topics[::-1].sort(order="confidence")
        print(topics)
        return topics

    def get_author_top_topics(self, author_id, top=10):
        """Generates the top N relevant topics of an author in our database.

        Only entries whose stored value is greater than 1 are kept, and 1 is
        subtracted from the value before it is returned (presumably the stored
        values carry an offset of 1 -- confirm against the training code).

        :param author_id: the author's ID in our database. Looked up as given
            first, then as a string.
        :param top: Number of topics to be returned.
        :return: a list of ``[topic_id, confidence]`` lists, sorted by
            descending confidence.
        :raises KeyError: if the author is found under neither key.
        """
        self._ensure_loaded()
        try:
            author = self.authors_lib[author_id]
        except KeyError:
            # JSON object keys are always strings, so retry with the string form.
            author = self.authors_lib[str(author_id)]
        top_topics = [
            [topic_id, confidence - 1]
            for topic_id, confidence in enumerate(author)
            if confidence > 1
        ]
        top_topics.sort(key=lambda pair: pair[1], reverse=True)
        return top_topics[:top]

    def get_topic_in_list(self, topic_id, topn=10):
        """Given a topic ID in the model, generates a list of terms.

        :param topic_id: The topic's ID in the model.
        :param topn: Number of terms requested from the model.
        :return: A list of ``[weight, term]`` string pairs, as split from the
            model's printed topic representation.
        """
        self._ensure_loaded()
        printed = self.model.print_topic(topic_id, topn=topn)
        # Split on the first '*' only, so a term that itself contains '*'
        # is not broken apart.
        return [part.strip().split('*', 1) for part in printed.split("+")]

    def get_topic_in_string(self, topic_id, top=5):
        """Given a topic ID in the model, generates a string representation of that topic.

        :param topic_id: The topic's ID in the model.
        :param top: Top N relevant terms.
        :return: A string representation of the topic.
        """
        # Ask the model for at least the usual 10 terms, and for more when the
        # caller wants more, so that ``top`` > 10 is not silently capped.
        topn = 10 if top is None else max(top, 10)
        topic_list = self.get_topic_in_list(topic_id, topn=topn)
        return " / ".join([entry[1] for entry in topic_list][:top])

    def get_topics_in_string(self, topics, confidence=False):
        """Converts a list of topics (with or without confidence levels) to a list of strings encoded in a dict.

        :param topics: The list of topics to be converted.
        :param confidence: If the input topics contains confidence levels, make sure this is set to True.
        :return: a list of dictionary that includes string representations (or with confidence levels)
        """
        topics_list = []
        for topic in topics:
            # With confidence levels each item is a (topic_id, confidence)
            # pair; otherwise the item is the topic ID itself.
            topic_id = topic[0] if confidence else topic
            topic_map = {
                "topic_id": topic_id,
                "string": self.get_topic_in_string(topic_id),
            }
            if confidence:
                topic_map["confidence"] = topic[1]
            topics_list.append(topic_map)
        return topics_list
# Build the module-level registry of LDA models: one wrapper per entry in
# the "lda_models.env" file, keyed by the model name used in that file.
model_files = load_env("lda_models.env")
models = {}
for model_name in model_files:
    models[model_name] = LdaModelWrapper(model_files[model_name])
print(models)