15 Oct, 2015

Project Ideas


Latent Dirichlet Allocation

Model

\[\begin{array}{rcl} x_{dw} &\sim& \text{Cat}_V(\mathbf\psi_{z_{dw}}),~ d=1...M, w=1...N_d \\\\ z_{dw} &\sim& \text{Cat}_K(\mathbf\theta_d),~ d=1...M, w=1...N_d \\\\ \mathbf\psi_k &\sim& \text{Dir}_V(\mathbf\beta),~ k=1...K \text{ topics} \\\\ \mathbf\theta_d &\sim& \text{Dir}_K(\mathbf\alpha),~ d=1...M \text{ documents} \\\\ \end{array}\]

where $V = \sum_{d=1}^{M}N_d$ is the total number of words.

Posteriors

\[\begin{array}{rclcl} \psi_k &|& \mathbf x &\sim& \text{Dir}( \beta+s_{1,k},..., \beta+s_{V,k}),~ k=1...K \\\\ \theta_d &|& \mathbf Z &\sim& \text{Dir}(\alpha+t_{1,d},...,\alpha+t_{K,d}),~ d=1...M \\\\ \end{array}\]

where $s_{i,k}$ is the number of times the $i^{th}$ word in the vocabulary has topic $k$ (throughout all documents). And $t_{j,d}$ is the number of words in document $d$ with topic $j$.

R Code

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
### lda.R

# Let x be a list of words.
# Let M be the number of documents

dwn <- as.numeric(readLines("dat/docword.kos.txt",n=3))
M <- dwn[1] # Number of documents
V <- dwn[2] # Number of unique words in documents
nnz <- dwn[3] # Total num of words in all documents

dat <- read.table("dat/docword.kos.txt",skip=3)
colnames(dat) <- c("doc","wid","count")

vocab <- readLines("dat/vocab.kos.txt")

x <- as.list(1:M) 
N <- NULL
u <- NULL

for (d in 1:M) {
  x[[d]] <- dat[which(dat[,1] == d),1:3]
  N[d] <- sum(x[[d]][,3])
  u[d] <- nrow(x[[d]])
}

K <- 5     # Number of topics preset to 5

# GIBBS
rdir <- function(N,a) {
  k <- length(a)
  x <- matrix(rgamma(k*N, a, 1), N, k, byrow=T)
  rowsums <- x %*% rep(1,k)
  x / as.vector(rowsums)
}

B <- 1000

# Preallocate
psi <- as.list(1:B); psi[[1]] <- matrix(1/V,K,V)
theta <- as.list(1:B); theta[[1]] <- matrix(1/K,M,K)
z <- x
for (d in 1:M) {
  nd <- z[[d]][,3]
  z[[d]] <- cbind(z[[d]][,-3], 
                  t(apply(matrix(nd),1,function(x) rmultinom(1,x,rep(1,K)) )))
}
al <- matrix(.1,B,K)
be <- matrix(.001,B,V)


word <- z[[d]][w,2]

temp <- sapply(z, function(zz) {
  if (word %in% zz[,2]) {
    ind <- which(zz[,2] == word) 
    out <- zz[ind, 2+k]
  } else {
    out <- 0
  }
  out
})


for (b in 2:B) {
  # Update:
  for (d in 1:M) {
    for (w in 1:u[d]) {
      for (k in 1:K) {
        word <- z[[d]][w,2]
        num_of_wd_w_with_topic_k <- sum(sapply(z, function(zz) {
          if (word %in% zz[,2]) {
            ind <- which(zz[,2] == word) 
            out <- zz[ind, 2+k]
          } else {
            out <- 0
          }
          out
        }))
        be[b,] <- be[b-1,] + num_of_wd_with_topic_k_and_wd_w
        psi[[b]][k,] <- rdir(1, be[b,])
      } # k in 1:K
    } # w in 1:N[d]
    al[b,] <- al[b-1,] + sum(z == k)
    theta[[b]][d,] <- rdir(1, be[b,])
  } # d in 1:M
} # b in 1:B

HDP Tutorial & Applications (towards the end) by Michael Jordan