Untitled

n_obs = 100
spike_perc = .1

###################

set.seed(123)

n_spikes = floor(spike_perc * n_obs)
inter_arrival = rexp(n_obs)
xv = cumsum(inter_arrival)

mu_spike = 5
sd_spike = sqrt(mu_spike)
sd_noise = mu_spike/75
spike_events = sample(n_obs, n_spikes)

flat_noise = rnorm(n_obs, 0, sd_noise)
spike_ampl = abs(rnorm(n_spikes, mu_spike, sd_spike))

yv = rep(0, n_obs)
yv[spike_events] = yv[spike_events]  + spike_ampl
yv = cumsum(yv)
yv = yv + flat_noise

dat = c(0,diff(yv))

library(igraph)

# Helper function. Equivalently, use mefa::stack
row_col_from_condensed_index = function(n, ix){
  nr=ceiling(n-(1+sqrt(1+4*(n^2-n-2*ix)))/2)
  nc=n-(2*n-nr+1)*nr/2+ix+nr
  cbind(nr,nc)
}

# Algorithm parameters, may require tuning
rq = .2 # distance quantile for thresholding (higher->fewer outliers)
p = .1  # percent of population necessary for a component to be flagged as an inlier
method='euclidean'

# calculate inter-observation distances and threshold to define adjacency matrix
n=NROW(dat)
d = dist(dat, method=method)
r = quantile(d, rq)
edges = t(sapply(which(d<=r), function(x) row_col_from_condensed_index(n,x)))

# build adjacency graph
g = graph.empty(n, directed=FALSE)
g[from=edges[,1], to=edges[,2]]=1
#g=nng(data.frame(dat), k=5, use.fnn=TRUE)
V(g)$name = 1:n

# Flag outliers based on component size
components = clusters(g, mode="strong")
csize = components$csize[components$membership]
outliers = V(g)[csize<=p*n]$name