suzanne_eng <- readLines("suzanne-cohen-eng.txt")
unigrams <- suzanne_eng %>% paste(collapse=" ") %>% 
  # tokenize by character (strsplit returns a list, so unlist it)
  strsplit(split="") %>% unlist %>% 
  # remove instances of characters you don't care about
  str_remove_all("[,.!'\"]") %>% 
  str_to_lower() %>%
  # make a frequency table of the characters
  table 
letters_space<-c(letters," ")
x<-rep(0,27);names(x)<-letters_space
for (letter in letters_space) x[letter]<-unigrams[letter]
unigrams<-x
barplot(log(unigrams/sum(unigrams,na.rm=TRUE)+1))

suzanne1<-c(" ",suzanne_eng) %>% paste(collapse="") %>% 
  # tokenize by character (strsplit returns a list, so unlist it)
  strsplit(split="") %>% unlist %>% 
  # remove instances of characters you don't care about
   str_remove_all("[,.!'\"]")  %>% 
   str_to_lower()

suzanne2<-c(suzanne_eng," ") %>% paste(collapse="") %>% 
  # tokenize by character (strsplit returns a list, so unlist it)
  strsplit(split="") %>% unlist %>% 
  # remove instances of characters you don't care about
  str_remove_all("[,.!'\"]")  %>% 
  str_to_lower()

bigrams<-table(suzanne2,suzanne1)

X<-matrix(0,27,27)
row.names(X)<-letters_space
colnames(X)<-letters_space
for (letteri in letters_space)
  for (letterj in letters_space)
    if ((letteri %in% row.names(bigrams))&&(letterj %in% row.names(bigrams))) X[letteri,letterj]<-bigrams[letteri,letterj]

bigrams<-X

Estimation of stationnary distribution

A <- matrix(c(0, 1/2,1,1,0,0,0,1/2,0),3,3)
## Solution 1
eigen.res<-eigen(t(A))
pi<-eigen.res$vectors[,1]
t(pi)%*%A

##               [,1]          [,2]          [,3]
## [1,] -0.6666667+0i -0.6666667+0i -0.3333333+0i

## Solution 2
M<-diag(rep(1,3))-A
M[,3]<-rep(1,3)
solve(t(M),b=c(0,0,1))

## [1] 0.4 0.4 0.2

Viterbi algorithm

Input

The observation space $O=\{o_{1},o_{2},\dots ,o_{N}\}$ , the state space $S=\{s_{1},s_{2},\dots ,s_{K}\}$ , an array of initial probabilities $\Pi =(\pi _{1},\pi _{2},\dots ,\pi _{K})$ such that $\pi _{i}$ stores the probability that $x_{1}=s_{i}$ , a sequence of observations $ X=(x_{1},x_{2},,x_{T})$ such that $ x_{t}=i$ if the observation at time $t$ is $ o_{i}$ , transition matrix $A$ of size $ KK$ such that $ A_{ij}$ stores the transition probability of transiting from state {s_{i}} to state {s_{j}} , emission matrix $B$ of size $K\times N$ such that $B_{ij$ stores the probability of observing $o_{j}$ from state $s_{i}$ . Output The most likely hidden state sequence $Z=(z_{1},z_{2},\ldots ,z_{T})$

A<-matrix(c(0.3,0.7,0,0,0.9,0.1,0.6,0,0.4),3,3,byrow = TRUE)
B<-matrix(c(0.5,0.2,0.3,0,0,0,0,0,0,0.2,0.7,0.1,0,0,0,0,0,0.1,0,0.5,0.4),7,3)
X<-c(1,3,4,6)
M<-diag(rep(1,3))-A
M[,3]<-rep(1,3)
Pi<-solve(t(M),b=c(0,0,1))

SimulationHMM<-function(Pi,A,B,n){
  Z<-rep(0,n)
  X<-rep(0,n)
  K<-length(Pi)
  N<-nrow(B)
  Z[1]<-sample(1:K,prob = Pi,size = 1,replace=TRUE)
  X[1]<-sample(1:N,prob=B[,Z[1]],size=1)
  for (i in 2:n){
    Z[i]<- sample(1:K,prob=A[Z[i-1],],size=1)
    X[i]<- sample(1:N,prob=B[,Z[i-1]],size=1)
  }
  return(list(X=X,Z=Z))
}


Hmmsimu<-SimulationHMM(Pi,A,B,100)
plot(Hmmsimu$X,col=Hmmsimu$Z)

SimulationHMMgauss<-function(Pi,A,n){
  Z<-rep(0,n)
  X<-rep(0,n)
  K<-length(Pi)
  N<-nrow(B)
  Z[1]<-sample(1:K,prob = Pi,size = 1,replace=TRUE)
  X[1]<-rnorm(1,mean=Z[1])
  for (i in 2:n){
    Z[i]<- sample(1:K,prob=A[Z[i-1],],size=1)
    X[i]<-rnorm(1,mean=2*Z[i])
  }
  return(list(X=X,Z=Z))
}

Hmmsimu<-SimulationHMMgauss(Pi,A,100)
plot(Hmmsimu$X,col=Hmmsimu$Z)

Viterbi<-function(Pi,A,B,X){
 K<-nrow(A)
 T<-length(X)
 S<-matrix(0,K,T)
 logV<-matrix(-Inf,K,T)
 Zest<-rep(0,T)
for (k in 1:K){
   logV[k,1]<-log(B[X[1],k])+log(Pi[k])
   S[k,1]<-0
 }
# Forward
 for (t in (2:T))
   for (k in (1:K)){
    logV[k,t]=max(logV[,t-1]+log(A[,k])+log(B[X[t],k]))
    S[k,t-1]=which.max(logV[,t-1]+log(A[,k])+log(B[X[t],k]))
   }
 # Backward
 Zest[T]<-which.max(logV[,T])
 for (t in (T-1):1)
   Zest[t]<-S[Zest[t+1],t]
 return(Zest)
 }
Viterbi(Pi,A,B,X)

## [1] 1 2 2 3

#Simulation

# Chaine de Markov sur les états caché
A<-matrix(c(0.95,0.1,0.05,0.9),2,2)
# Loi d'emission
B<-matrix(c(rep(1/6,6),rep(1/10,5),5/10),6,2)

dishonestCasino<-function(n,A,B){
  K<-nrow(A) # nb d'état caché
  L<-nrow(B) # nb d'état possible pour X_t
  # Prenons un Pi uniforme
  Pi<-rep(1/K,K)
  
  # X obs, Z caché
  Z<-rep(0,n); X<-rep(0,n)
  # Simulation de l'état caché
  Z[1]<-sample(1:K,prob=Pi,size=1)
  for (t in 2:n)
      Z[t]<-sample(1:K,prob=A[Z[t-1],],size=1)
  for (t in 1:n)
      X[t]<-sample(1:L,prob=B[,Z[t]],size=1)
  
  return(list(X=X,Z=Z))
}

dishonestCasino(n=100,A,B)->games
plot(games$X,col=c("green","red")[games$Z])

games$Z

##   [1] 2 2 2 2 2 2 2 2 1 1 1 1 1 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [38] 1 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 2 2 1 1 1 1 2 2 2 2 2 2 2 2
##  [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

#table(games$Z,res)

Basic Markov chain for langage recognition

Christophe Ambroise

9/8/2020

Estimation of stationnary distribution

Viterbi algorithm