### Seth Frey
### Error Detection, applied in this case to the Irish tune typical of the
### Looney Tunes Xylophone Gag
### Class Project for Chris Raphael's Music Processing, INFO-I547 at IU
### Spring 2010 (4/26/10)
### Boilerplate
rm( list=ls() )
library(tuneR)
setWavPlayer("/usr/local/bin/playRWave")
path <- "~/Desktop/projecto/music_processing/"
### useful library for error analysis
#source(paste(path, "code/spectrogram.r", sep="" ), echo=FALSE)
### The score
### endearing young charms in MIDI
#s_notes<- c(E4, D4, C4, D4, C4, C4, E4, G4, F4, A4, C5, C5, B4, A4)
eyc_orig_notes <- c(64, 62, 60, 62, 60, 60, 64, 67, 65, 69, 72, 72)#, 71, 69)
score <- eyc_orig_notes
timing <- c(.25,.25,.75,.25,.5,.5,.5,.5,.5,.5,.5,.75,.25,.25) # timings for the full 14-note phrase; only the commented-out test below uses them
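### Quick sanity check on the transcription (my addition; midi_to_name is
### a helper I wrote, not part of the original code):
midi_to_name <- function(m) {
pc <- c("C","C#","D","D#","E","F","F#","G","G#","A","A#","B")
paste(pc[(m %% 12) + 1], (m %/% 12) - 1, sep="")
}
#print(midi_to_name(score)) # should start "E4" "D4" "C4" ...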
### Test the above representation of the score (note: this block uses sr
### and ierrors, which are only defined further down):
### scale timing info
#timing<- timing /sum(timing)
### less than two seconds
#duration<- 150000
#sample <-c()
#for (i in 1:length(score)){ sample<-c(sample, rep(score[i], timing[i]*duration))}
#v = 2*pi*440.*2^((sample-69)/12)/sr
#z = round((2^13)*sin(cumsum(v)))
### insert click at errors
#z[ierrors] <- 32000
#u.samp = Wave(z, samp.rate = sr, bit=16) # make wave struct
#play(u.samp)
### Audio of endearing young charms, with or without mistakes.
### Run just one of the readWave lines below (each overwrites w; if the
### whole script is sourced, only the last one takes effect).
### bad (these don't get recognized well by the HMM)
w = readWave(paste(path, "final_project/bugs2001.wav", sep="" ))
w = readWave(paste(path, "final_project/bugs1001.wav", sep="" ))
### maybe (the HMM only partly recognizes these, and its recognition
### mistakes get labeled as player errors by my code)
w = readWave(paste(path, "final_project/travis001good.wav", sep="" ))
w = readWave(paste(path, "final_project/travis002good.wav", sep="" ))
### good (the HMM recognizes these well, and the errors in them get
### caught, though a few recognition errors still get labeled as player
### errors in these samples)
w = readWave(paste(path, "final_project/travislow003.wav", sep="" ))
w = readWave(paste(path, "final_project/decent001err2.wav", sep="" ))
w = readWave(paste(path, "final_project/travismistake003.wav", sep="" ))
w = readWave(paste(path, "final_project/travismistake006.wav", sep="" ))
w = readWave(paste(path, "final_project/yosemite1001.wav", sep="" ))
w = readWave(paste(path, "final_project/travislow004.wav", sep="" ))
### This project is a modification of the HMM code provided as the solution to HW6.
### http://www.music.informatics.indiana.edu/courses/I547/hw6_sln.r
### or
### http://enfascination.com/forever/xylophonegag/hw6_sln.r
### I have removed all of the comments from the original code to
### highlight my own. For the unfamiliar, this may obscure the workings
### of the HMM.
### Extract some parameters from the audio
bits = w@bit
sr = w@samp.rate
y = w@left
#y = y[30000:1000000] ### take subset of audio, for faster testing
u = Wave(y, samp.rate = sr, bit=bits)
### preserve original
u.orig = Wave(y, samp.rate = sr, bit=bits)
#play(u.orig)
### Other Important parameters
lop = 59 # the lowest possible pitch (its states double as the rest/silence states)
hip = 96 # the highest possible pitch
N = 1024 ### the frame length
skiplen = N/2
frames = floor(length(y)/skiplen)-1 ### The number of frames, given N
notes = length(lop:hip) ### Number of notes represented
### minimum number of frames constituting a played note (must be small
### for realistic run times)
state_per_note = 4
### an unexpected pitch must persist for a whole window of
### state_per_error+1 frames to count as a played error
state_per_error <- 8
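### The state indexing convention used throughout: note k (k = 1 for lop)
### occupies states (k-1)*state_per_note+1 through k*state_per_note.
### Two helpers (my addition, unused below) that make the arithmetic explicit:
state_to_pitch <- function(s) floor((s - 1)/state_per_note) + lop
pitch_to_first_state <- function(p) (p - lop)*state_per_note + 1
#stopifnot(all(state_to_pitch(pitch_to_first_state(lop:hip)) == lop:hip))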
### Transition probability matrix (almost all borrowed code)
trans = matrix(0,notes*state_per_note,notes*state_per_note)
for (i in 1:ncol(trans)){
if (i%%state_per_note>0){
trans[i,i] = 1
trans[i,i+1] = 1
}
}
### I had some problems with notes being identified in the wrong octave.
### The code below makes note changes of fewer than 6 half steps, in
### either direction, twice as likely as larger jumps.
### Maybe useless; I haven't done a thorough error analysis.
close_note_neighborhood <- 6
for (i in 1:notes) {
for (j in 1:notes) {
if (i!=j) {
if (abs(i-j)<close_note_neighborhood) {
trans[i*state_per_note,(j-1)*state_per_note+1] = 2
}
else {
trans[i*state_per_note,(j-1)*state_per_note+1] = 1
}
}
}
}
### Normalize each row of the transition matrix into a probability distribution
for (i in 1:nrow(trans))
trans[i,]=trans[i,]/sum(trans[i,])
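### Sanity check (my addition): after normalization every row should sum
### to 1, with close jumps twice as likely as far ones.
#stopifnot(abs(rowSums(trans) - 1) < 1e-9)
#round(trans[state_per_note, (0:(notes-1))*state_per_note + 1], 4) # jump probs out of the lowest note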
####### Creating the emission probability function #######
### create a spectral template for each note.
wid = 1   # standard deviation, in FFT bins, of each harmonic peak
decay = 4 # rate at which harmonic amplitudes fall off
tplt = matrix(.01,hip,N/2) # one row per MIDI pitch; the .01 floor keeps log() finite
for (j in lop:hip) {
f = 440*2^((j-69)/12)
b = f*N/sr
for (h in 1:15) {
if (j == lop) next # lop is the rest = flat template
tplt[j,] = tplt[j,] + 2^(-h/decay)*dnorm(1:(N/2),mean=b*h,sd = wid)
}
tplt[j,] = tplt[j,]/sum(tplt[j,])
# plot(tplt[j,],type='l')
# scan() # wait for human to type key
}
t = seq(from=0,by=2*pi/N,length=N)
win = (1 + cos(t-pi))/2 # Hann window
#### Function for computing the log emission probabilities
log_emit = function (observation,cs,energy) {
if (energy > 50000000){ #### enough energy for a pitched note: score this state's template against the spectrum
Y = fft(win*observation)
log_emit = sum(log(tplt[floor((cs-1)/state_per_note)+lop,])*Mod(Y[1:(N/2)]))
}
else{ #### below the energy threshold, only rest states can generate this observation
log_emit = -Inf
if (cs<=state_per_note) log_emit = 0
}
log_emit
}
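### Usage sketch (my addition): score one frame against the first state of
### every note and report the winning pitch. The frame position is arbitrary.
#obs <- y[10001:(10000+N)]
#scores <- sapply((0:(notes-1))*state_per_note + 1,
#                 function(cs) log_emit(obs, cs, sum(obs^2)))
#cat("best-scoring pitch:", lop + which.max(scores) - 1, "\n")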
### This is the core of the recognition. All of it is untouched; error
### detection doesn't start until after recognition and backtracking.
### ----- begin untouched HW6 code -----
####### Dynamic programming #######
### creating dynamic programming matrix
lat = matrix(-Inf,ncol(trans),frames) # the array that holds the dynamic programming scores
best = matrix(0,ncol(trans),frames) # the array of best predecessors.
### initialize
### possible start states: every first state of a note (the same constant
### for each of them, so its scale doesn't affect the parse)
lat[state_per_note*(1:notes)-(state_per_note-1),1] = 1/notes
#### start computation
for (j in 2:frames) { ### loop1: main loop of program (for all frames)
s = (j-1)*skiplen +1 # start of frame
observation = y[s:(s+N-1)] # the frame
energy = sum(observation^2)
### cs = current_state ; ps = previous_state
for (cs in 1:ncol(trans)) { ### loop2: loop for current state
### emission depends only on the note, so compute it once per note
if (cs%%state_per_note==1) log_emit_value = log_emit(observation,cs,energy)
for (ps in 1:ncol(trans)) { ### loop3: the possible predecessor states
if (trans[ps,cs]==0) next
temp = lat[ps,j-1] + log_emit_value + log(trans[ps,cs])# score by data model
if (temp > lat[cs,j]){
lat[cs,j] = temp
best[cs,j] = ps
}
}}
}
####### Trace back #######
hat = rep(0,frames) # the optimal parse
hat[frames] = which.max(lat[,frames]) # find the optimal final state
for (j in frames:2) hat[j-1] = best[hat[j],j] # trace back optimal parse
### Visualize results
hat = floor((hat-1)/state_per_note) + lop ##### map states to pitches: every state_per_note states represent the same pitch
hat.orig <- hat
hat <- hat.orig # redundant here; re-run this line to restore hat when experimenting
plot(hat)
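### A more readable plot (my addition), with time in seconds on the x-axis:
#plot((1:frames)*skiplen/sr, hat, type="s", xlab="time (sec)", ylab="MIDI pitch")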
### ----- end untouched HW6 code -----
### End of shameless copying, transitioning into slightly shameful copying
### Simplify the score (this project does not attempt to catch timing
### problems, only wrong notes). The cleanest way to do that is to remove
### repeated notes and rely on robustness to silence, which the HMM
### implementation above already provides.
### Remove consecutive duplicate notes
xlist <- c()
for (i in 2:length(score)) {
if (score[i] == score[i-1]) {
xlist <- c(xlist, i)
}
}
if (length(xlist) > 0) score <- score[-xlist] # guard: score[-c()] would empty the score
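### Equivalent one-liner (my addition): run-length encoding collapses
### consecutive repeats in one step.
#score <- rle(score)$values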
### Find the first stably recognized pitch (call it the key), and transpose
### the whole score up or down so its first note matches
key <-0
for (i in (state_per_error+1):(frames-1)) {
key.window <- hat[(i-state_per_error):i]
if ((length(unique(key.window))==1) && (unique(key.window) != lop)) {
key <- key.window[1]
break
}
}
score.old <- score ### saving old unshifted score
score <- score - (score[1] - key)
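### Worked example (my addition): if the score starts on E4 (64) but the
### first stable recognized pitch is E5 (76), every note shifts up an octave:
#c(64, 62, 60) - (64 - 76) # -> 76 74 72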
### Begin error detection/annotation
### detect errors while iterating through solution (the recognized audio
### representation "hat") and building waveform
### Current index into the score (where we are in my representation
### of the song)
iscore <- 1
### A place to store the indices of errors. Where in the score
### was a mistake made
ierrors <- c()
### Accumulators for converting the unmarked-up and marked-up versions
### of the tune from MIDI representation into audio. (Note: this reuses
### the name t; the window time vector above is no longer needed.)
t = 0
t.err<-0
in.error<-FALSE ### The error state. Starting in not-in-error state
for (i in (state_per_error+1):(frames-1)) {
### This window is state_per_error+1 frames long; I slide it along to find mistakes
window <- hat[(i-state_per_error):i]
### The legal pitches here: lop (silence) or the note of the score
### that I think I'm on.
legal <- c(lop, score[iscore])
### Phase increments for the marked-up and non-marked-up versions of
### the current MIDI note; identical, but kept separate for comparison
v.err = 2*pi*440.*2^((hat[i]-69)/12)/sr
v = 2*pi*440.*2^((hat[i]-69)/12)/sr
### Silence
if (hat[i] == lop) { v = 0; v.err <- 0 }
### If I am in an error, look for a return to the previous note,
### or the expected next note, or the one after that, in case the
### expected next note gets skipped. Also watch for silence, a sign
### that the player won't return to the previous note after this
### mistake.
if (in.error){
### If the player returns to the expected note, leave the error state
if (0 < length(intersect(c(score[iscore]), window))) {
in.error <- FALSE
}
### If the window enters one of the future pitches, leave error state and
### advance past current location in the score.
else if (0 < length(intersect(c(lop, score[iscore+1], score[iscore+2]), window))) {
in.error <- FALSE
### But, of course, don't advance in score if at end of score.
if (iscore < length(score)) iscore <- iscore+1
}
### If I haven't left the error state, I am still in it and shall
### insert a high pitched screech to indicate recognition of error
else {
ierrors <- c(ierrors, iscore)
v.err = 2*pi*440.*2^((hip-69)/12)/sr
}
### If not in error state
} else {
### If the current window contains neither silence nor the expected note
if (0 == length(intersect(legal, window))) {
### If the window also lacks the next note, enter the error
### state and log the location in ierrors, the list of
### locations-in-error
if (0 == length(intersect(c(legal, score[iscore+1]), window))) {
### Debugging output
#print ("at")
#print(i)
#print(hat[i])
#print(iscore)
#print("expect")
#print(c(legal, score[iscore+1], score[iscore+2]))
#print("in")
#print(window)
ierrors <- c(ierrors, iscore)
in.error <- TRUE
}
### If current window lacks silence and current note
### but has next note, advance in the score
else {
### unless you are already at the end of the score
if (iscore < length(score)) iscore <- iscore+1
}
}
}
### Append this frame's phase increments, for converting the MIDI
### representation back into an audio waveform
t = c(t,rep(v,N/2))
t.err = c(t.err,rep(v.err,N/2))
}
### End of error detection/annotation
### create scaled waveform of both HMM representation of audio and
### version of audio annotated with "caught" mistakes
z = round((2^13)*sin(cumsum(t)))
z.caught = round((2^13)*sin(cumsum(t.err)))
### Convert both versions to .wav file
u = Wave(z, samp.rate = sr, bit=16) # make wave struct
u.caught = Wave(z.caught, samp.rate = sr, bit=16) # make wave struct
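### The synthesis above uses phase accumulation: t holds per-sample phase
### increments, so cumsum(t) is a running phase and sin() of it stays
### continuous across pitch changes. Minimal standalone sketch of the same
### trick (my addition):
#incr <- 2*pi*440/sr # phase step for A440
#tone <- round((2^13)*sin(cumsum(rep(incr, sr)))) # one second of A440
#play(Wave(tone, samp.rate = sr, bit = 16))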
### show where in the score mistakes were made
ierrors <- unique(ierrors)
print("the score")
score
print("the indices of errors")
ierrors
print("the notes in the score corresponding to those indices")
score[ierrors]
### Play the original audio, then the recognized audio, then the error-annotated audio
play(u.orig)
play(u)
play(u.caught)
### For writing audio to file
#writeWave(u, "~/Desktop/projecto/music_processing/final_project/Good_one_err11_01_frame10_hmm_bad.wav")
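### e.g., to save the error-annotated version (hypothetical output path):
#writeWave(u.caught, paste(path, "final_project/annotated.wav", sep=""))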