1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 # Source: gensim_preprocess.py import json import os JSONS_DIR = "/path/to/jsons/dir" TEXTS_DIR = "/path/to/texts/dir" for fn in os . listdir ( JSONS_DIR ): print "Converting JSON: %s " % ( fn ) fjson = open ( os . path . join ( JSONS_DIR , fn ), 'rb' ) data = json . load ( fjson ) fjson . close () tfn = os . path . splitext ( fn )[ 0 ] + ".txt" ftext = open ( os . path . join ( TEXTS_DIR , tfn ), 'wb' ) ftext . write ( data [ "text" ] . encode ( "utf-8" )) ftext . close ()

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 # Source: bow_model.py import logging import os import nltk import gensim def iter_docs ( topdir , stoplist ): for fn in os . listdir ( topdir ): fin = open ( os . path . join ( topdir , fn ), 'rb' ) text = fin . read () fin . close () yield ( x for x in gensim . utils . tokenize ( text , lowercase = True , deacc = True , errors = "ignore" ) if x not in stoplist ) class MyCorpus ( object ): def __init__ ( self , topdir , stoplist ): self . topdir = topdir self . stoplist = stoplist self . dictionary = gensim . corpora . Dictionary ( iter_docs ( topdir , stoplist )) def __iter__ ( self ): for tokens in iter_docs ( self . topdir , self . stoplist ): yield self . dictionary . doc2bow ( tokens ) logging . basicConfig ( format = ' %(asctime)s : %(levelname)s : %(message)s ' , level = logging . INFO ) TEXTS_DIR = "/path/to/texts/dir" MODELS_DIR = "/path/to/models/dir" stoplist = set ( nltk . corpus . stopwords . words ( "english" )) corpus = MyCorpus ( TEXTS_DIR , stoplist ) corpus . dictionary . save ( os . path . join ( MODELS_DIR , "mtsamples.dict" )) gensim . corpora . MmCorpus . serialize ( os . path . join ( MODELS_DIR , "mtsamples.mm" ), corpus )

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 # Source: lsi_model.py import logging import os import gensim logging . basicConfig ( format = ' %(asctime)s : %(levelname)s : %(message)s ' , level = logging . INFO ) MODELS_DIR = "/path/to/models/dir" dictionary = gensim . corpora . Dictionary . load ( os . path . join ( MODELS_DIR , "mtsamples.dict" )) corpus = gensim . corpora . MmCorpus ( os . path . join ( MODELS_DIR , "mtsamples.mm" )) tfidf = gensim . models . TfidfModel ( corpus , normalize = True ) corpus_tfidf = tfidf [ corpus ] # project to 2 dimensions for visualization lsi = gensim . models . LsiModel ( corpus_tfidf , id2word = dictionary , num_topics = 2 ) # write out coordinates to file fcoords = open ( os . path . join ( MODELS_DIR , "coords.csv" ), 'wb' ) for vector in lsi [ corpus ]: if len ( vector ) != 2 : continue fcoords . write ( " %6.4f \t %6.4f

" % ( vector [ 0 ][ 1 ], vector [ 1 ][ 1 ])) fcoords . close ()

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 # Source: num_topics.py import os import numpy as np import matplotlib.pyplot as plt from sklearn.cluster import KMeans MODELS_DIR = "/path/to/models/dir" MAX_K = 10 X = np . loadtxt ( os . path . join ( MODELS_DIR , "coords.csv" ), delimiter = " \t " ) ks = range ( 1 , MAX_K + 1 ) inertias = np . zeros ( MAX_K ) diff = np . zeros ( MAX_K ) diff2 = np . zeros ( MAX_K ) diff3 = np . zeros ( MAX_K ) for k in ks : kmeans = KMeans ( k ) . fit ( X ) inertias [ k - 1 ] = kmeans . inertia_ # first difference if k > 1 : diff [ k - 1 ] = inertias [ k - 1 ] - inertias [ k - 2 ] # second difference if k > 2 : diff2 [ k - 1 ] = diff [ k - 1 ] - diff [ k - 2 ] # third difference if k > 3 : diff3 [ k - 1 ] = diff2 [ k - 1 ] - diff2 [ k - 2 ] elbow = np . argmin ( diff3 [ 3 :]) + 3 plt . plot ( ks , inertias , "b*-" ) plt . plot ( ks [ elbow ], inertias [ elbow ], marker = 'o' , markersize = 12 , markeredgewidth = 2 , markeredgecolor = 'r' , markerfacecolor = None ) plt . ylabel ( "Inertia" ) plt . xlabel ( "K" ) plt . show ()

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 # Source: viz_topics_scatter.py import os import numpy as np import matplotlib.pyplot as plt from sklearn.cluster import KMeans MODELS_DIR = "/path/to/models/dir" NUM_TOPICS = 5 X = np . loadtxt ( os . path . join ( MODELS_DIR , "coords.csv" ), delimiter = " \t " ) kmeans = KMeans ( NUM_TOPICS ) . fit ( X ) y = kmeans . labels_ colors = [ "b" , "g" , "r" , "m" , "c" ] for i in range ( X . shape [ 0 ]): plt . scatter ( X [ i ][ 0 ], X [ i ][ 1 ], c = colors [ y [ i ]], s = 10 ) plt . show ()

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 # Source: lda_model.py import logging import os import gensim MODELS_DIR = "/path/to/models/dir" NUM_TOPICS = 5 logging . basicConfig ( format = ' %(asctime)s : %(levelname)s : %(message)s ' , level = logging . INFO ) dictionary = gensim . corpora . Dictionary . load ( os . path . join ( MODELS_DIR , "mtsamples.dict" )) corpus = gensim . corpora . MmCorpus ( os . path . join ( MODELS_DIR , "mtsamples.mm" )) # Project to LDA space lda = gensim . models . LdaModel ( corpus , id2word = dictionary , num_topics = NUM_TOPICS ) lda . print_topics ( NUM_TOPICS )

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 2014-08-27 20:59:04,850 : INFO : topic #0 (0.200): 0.015*patient + \ 0.014*right + 0.009*left + 0.009*procedure + 0.006*well + \ 0.005*placed + 0.005*history + 0.004*skin + 0.004*normal + \ 0.004*anesthesia 2014-08-27 20:59:04,852 : INFO : topic #1 (0.200): 0.015*patient + \ 0.009*placed + 0.008*right + 0.007*normal + 0.007*left + \ 0.006*procedure + 0.005*using + 0.004*anterior + 0.004*history + \ 0.004*skin 2014-08-27 20:59:04,854 : INFO : topic #2 (0.200): 0.012*left + \ 0.010*patient + 0.010*right + 0.007*history + 0.005*well + \ 0.005*normal + 0.004*pain + 0.004*also + 0.003*without + \ 0.003*disease 2014-08-27 20:59:04,856 : INFO : topic #3 (0.200): 0.019*patient + \ 0.011*left + 0.011*normal + 0.010*right + 0.006*well + \ 0.006*procedure + 0.005*pain + 0.004*placed + 0.004*history + \ 0.004*using 2014-08-27 20:59:04,858 : INFO : topic #4 (0.200): 0.022*patient + \ 0.013*history + 0.007*mg + 0.006*normal + 0.005*pain + \ 0.005*also + 0.005*left + 0.004*right + 0.004*year + 0.004*well

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 import os import wordcloud MODELS_DIR = "models" final_topics = open ( os . path . join ( MODELS_DIR , "final_topics.txt" ), 'rb' ) curr_topic = 0 for line in final_topics : line = line . strip ()[ line . rindex ( ":" ) + 2 :] scores = [ float ( x . split ( "*" )[ 0 ]) for x in line . split ( " + " )] words = [ x . split ( "*" )[ 1 ] for x in line . split ( " + " )] freqs = [] for word , score in zip ( words , scores ): freqs . append (( word , score )) elements = wordcloud . fit_words ( freqs , width = 120 , height = 120 ) wordcloud . draw ( elements , "gs_topic_ %d .png" % ( curr_topic ), width = 120 , height = 120 ) curr_topic += 1 final_topics . close ()

Over the past couple of weeks, I've been trying different ways to gain insight into my little corpus of 2000+ Patient Health Records (PHRs). Topic modeling is one way to do it, and I've been meaning to learn gensim , a Python library for topic modeling, so I decided to use gensim to do some topic modeling on this dataset. I have used Mahout for Topic Modeling before, but my data is quite small and doesn't need the complexity of Map-Reduce (besides the objective is to learn gensim :-)). As a bonus, gensim also offers a wrapper for Mallet , another popular Java topic modeling library.This post describes my experiment. Ultimately, the insights I got were not particulary interesting, but it got me familiar with gensim. I hope you find it useful.The source format of the dataset is a collection of JSON files. My first step is to pre-process the data so that the text portion is written out into a collection of text files. In hindsight, this seems redundant since this could be merged with the next step, but it makes things a bit clearer if separated.The next step is to convert the texts to a format that gensim can use - namely a Bag of Words (BoW) representation. Gensim expects to be fed a corpus data structure, basically a list of sparse vectors. The sparse vector elements consists of (id, score) pairs, where the id is a numeric ID that is mapped to the term via a dictionary. Gensim's author has taken pains to keep its memory requirements down by using a streaming approach to build corpora and dictionaries. The iter_docs() function below implements this streaming approach.In the code below, I read the text of each file, pass the words through gensim's tokenizer and filter out stopwords (from NLTK's English stopword list) using our custom MyCorpus class. These words are used to create a dictionary and BoW corpus, which is serialized to files for use in the next step.I didn't know (and didn't have an opinion about) how many topics this corpus should yield so I decided to compute this by reducing the features to two dimensions, then clustering the points for different values of K (number of clusters) to find an optimum value. Gensim offers various transforms that allow us to project the vectors in a corpus to a different coordinate space. One such transform is the Latent Semantic Indexing (LSI) transform, which we use to project the original data to 2D.Next I clustered the points in the reduced 2D LSI space using KMeans, varying the number of clusters (K) from 1 to 10. The objective function used is the Inertia of the cluster, defined as the sum of squared differences of each point to its cluster centroid. This value is provided directly from the Scikit-Learn KMeans algorithm. Other popular functions include Distortion (Inertia divided by the number of points) or the Percentage of Variance Explained, as described on this StackOverflow page I plotted the Inertias for different values of K, then used Vincent Granville's approach of calculating the third differential to find an elbow point . The elbow point happens here for K=5 and is marked with a red dot in the graph below.I then re-ran the KMeans algorithm with K=5 and generated the clusters.which gives me clusters that look like this.I then ran the full LDA transform against the BoW corpus, with the number of topics set to 5. As in LSI, I load up the corpus and dictionary from files, then apply the transform to project the documents into the LDA Topic space. Notice that LDA and LSI are conceptually similar in gensim - both are transforms that map one vector space to another.The results are as follows. As you can see, each topic is made up of a mixture of terms. The top 10 terms from each topic is shown in the output below and covers between 69-80% of the probability space for each topic.and here is the same information shown graphically using wordclouds (generated using the code below).As I mentioned earlier, the only insight I get from the above is that the data seems to be predominantly about patients, history and procedures. But we already knew that :-).Some ideas for improvement for the future. One could be to really get aggressive about removing noise words, much more than just the NLTK English stopwords, that way we could remove terms such as "without" and "year" from the input, possibly resulting in a cleaner set of topics. Another could be to check for higher values of K, perhaps against a higher dimensional space, since we are computing the elbow analytically anyway.