B Support Code
library(knitr)
library(kableExtra)
library(segmentr)
library(tibble)
library(magrittr)
library(purrr)
knitr::opts_chunk$set(
cache=TRUE,
echo=FALSE
)
source("helper.R")
n <- 100
B1 <- sample(1:2, n, replace = TRUE)
B2 <- sample(1:2, n, replace = TRUE)
B3 <- sample(1:2, n, replace = TRUE)
B4 <- sample(1:2, n, replace = TRUE)
B5 <- sample(1:2, n, replace = TRUE)
B6 <- sample(1:2, n, replace = TRUE)
X1 <- B1
X2 <- B1 - B2
X3 <- B2
X4 <- B1 + B2
X5 <- B1
X6 <- B3
X7 <- B3 - B4
X8 <- B4
X9 <- B3 + B4
X10 <- B3
X11 <- B5
X12 <- B5 - B6
X13 <- B6
X14 <- B5 + B6
X15 <- B5
D_example <- cbind(X1, X2, X3, X4, X5, X6, X7, X8,
X9, X10, X11, X12, X13, X14, X15)
head(D_example) %>%
kable(caption="Sample values of the correlated random
variables defined in (4.1).")
multivariate_cost = function(X) -multivariate(X) + 2 ^ ncol(X)
results_multivariate_penalized <- segment(
D_example,
cost = multivariate_cost,
algorithm = "exact"
)
print_results_table(
results_multivariate_penalized,
caption="Segment tables as defined in
(4.1)."
)
segment(D_example,
cost = function(X) -multivariate(X),
algorithm = "exact") %>%
print_results_table(
caption="Results of segmentation using a non-penalized
multivariate cost")
heterogeneity_cost <- function(X) {
mean_value <- mean(X, na.rm = T)
if (is.na(mean_value)) {
0
} else {
sum((X - mean_value)^2)
}
}
penalized_heterogeneity_cost <- function(X) heterogeneity_cost(X) + 1
make_segment <- function(n, p) {
matrix(rbinom(100 * n, 1, p), nrow = 100)
}
D_genetic <- cbind(
make_segment(5, 0.9),
make_segment(10, 0.1),
make_segment(5, 0.9)
)
segment(
D_genetic,
cost = penalized_heterogeneity_cost,
algorithm = "hieralg"
) %>%
print_results_table(
caption="Results of segmentation by applying the
cost function defined in
(4.5) to a sample of size
10 of the model defined in (4.6).
")
expected_multivariate_example <- c(6, 11)
deviation <- partial(
segment_distance,
changepoints2=expected_multivariate_example
)
exact_multivariate_changepoints <- segment(
D_example,
cost = multivariate_cost,
algorithm = "exact"
)$changepoints
hieralg_multivariate_changepoints <- segment(
D_example,
cost = multivariate_cost,
algorithm = "hierarchical"
)$changepoints
hybrid_multivariate_changepoints <- segment(
D_example,
cost = multivariate_cost,
algorithm = "hybrid"
)$changepoints
hybrid_multivariate_changepoints_threshold <- segment(
D_example,
cost = multivariate_cost,
algorithm = "hybrid",
threshold = 4
)$changepoints
tribble(
~`Description`,
~`Changepoints`,
~`Hausdorff Distance`,
"Expected solution",
comma_format(expected_multivariate_example),
deviation(expected_multivariate_example),
"Exact algorithm estimate",
comma_format(exact_multivariate_changepoints),
deviation(exact_multivariate_changepoints),
"Hierarchical algorithm estimate",
comma_format(hieralg_multivariate_changepoints),
deviation(hieralg_multivariate_changepoints),
"Hybrid algorithm estimate with threshold 50",
comma_format(hybrid_multivariate_changepoints),
deviation(hybrid_multivariate_changepoints),
"Hybrid algorithm estimate with threshold 4",
comma_format(hybrid_multivariate_changepoints_threshold),
deviation(hybrid_multivariate_changepoints_threshold)
) %>%
kable(caption="Comparison of solutions of different algorithms
to segment the data set described in (4.1),
measuring how far each solution is from the ideal
solution using the Hausdorff distance.")
expected_mean_example <- c(6, 16)
deviation <- partial(
segment_distance,
changepoints2=expected_mean_example
)
exact_mean_changepoints <- segment(
D_genetic,
cost = penalized_heterogeneity_cost,
algorithm = "exact"
)$changepoints
hieralg_mean_changepoints <- segment(
D_genetic,
cost = penalized_heterogeneity_cost,
algorithm = "hierarchical"
)$changepoints
hybrid_mean_changepoints <- segment(
D_genetic,
cost = penalized_heterogeneity_cost,
algorithm = "hybrid"
)$changepoints
hybrid_mean_changepoints_threshold <- segment(
D_genetic,
cost = penalized_heterogeneity_cost,
algorithm = "hybrid",
threshold = 4
)$changepoints
tribble(
~`Description`,
~`Changepoints`,
~`Hausdorff Distance`,
"Expected solution",
comma_format(expected_mean_example),
deviation(expected_mean_example),
"Exact algorithm estimate",
comma_format(exact_mean_changepoints),
deviation(exact_mean_changepoints),
"Hierarchical algorithm estimate",
comma_format(hieralg_mean_changepoints),
deviation(hieralg_mean_changepoints),
"Hybrid algorithm estimate with threshold 50",
comma_format(hybrid_mean_changepoints),
deviation(hybrid_mean_changepoints),
"Hybrid algorithm estimate with threshold 4",
comma_format(hybrid_mean_changepoints_threshold),
deviation(hybrid_mean_changepoints_threshold)
) %>%
kable(caption="Comparison of solutions of different algorithms
to segment the data set described in (4.6),
measuring how far each solution is from the ideal
solution using the Hausdorff distance.")
library(segmentr)
library(tidyr)
library(tibble)
library(dplyr)
library(lubridate)
library(magrittr)
library(purrr)
library(knitr)
library(kableExtra)
data(berlin)
as_tibble(berlin, rownames="station") %>%
mutate(`..`="..") %>%
select(station, `2010-01-01`:`2010-01-03`,
`..`, `2011-12-29`:`2011-12-31`) %>%
kable(caption="
First and last columns of the `berlin` dataset
") %>%
column_spec(1, width="5em")
berlin %>%
colMeans() %>%
enframe("time", "temperature") %>%
mutate_at(vars(time), ymd) %>%
with(plot(time, temperature, cex=0.2))
expected_berlin <- list(
changepoints=c(200, 360, 570),
segments=list(1:199, 360:569, 570:ncol(berlin))
)
plot_results(expected_berlin, berlin)
print_results_table(
expected_berlin,
caption="Expected values to be found when segmenting
the Berlin dataset"
)
residual_cost <- function (data) {
fit <- t(data) %>%
as_tibble() %>%
rowid_to_column() %>%
gather(station, temperature, -rowid) %>%
with(lm(temperature ~ rowid))
mean(fit$residuals ^ 2)
}
tibble(
`Small Size`=residual_cost(berlin[, 2:3]),
`Medium Size`=residual_cost(berlin[, 1:150]),
`Large Size`=residual_cost(berlin)
) %>%
kable(
caption="Sample values of the squared residual cost
function for different sizes of segment
candidates"
)
results_non_penalized <- segment(
berlin,
cost = residual_cost,
algorithm = "hierarchical"
)
print_results_table(
results_non_penalized,
caption="Results of the segmentation algorithm
using a non-penalized cost function"
)
plot_results(results_non_penalized, berlin)
plot_curve(~ exp(0.3*(. - 50)) + exp(0.3 * (-. + 50)),
from = 0, to = 100, type="l")
penalized_cost <- auto_penalize(berlin,
cost = residual_cost)
results_auto_penalized <- segment(
berlin,
cost = residual_cost,
algorithm = "hierarchical"
)
print_results_table(
results_auto_penalized,
caption="Results of the segmentation algorithm
using an auto-penalized cost function"
)
plot_results(results_auto_penalized, berlin)
penalized_cost <- auto_penalize(
berlin,
cost = residual_cost,
big_segment_penalty = 2
)
results_adjusted_penalized <- segment(
berlin,
cost = penalized_cost,
algorithm = "hierarchical"
)
print_results_table(
results_adjusted_penalized,
caption="Results of the segmentation algorithm
using an adjusted penalized cost function"
)
plot_results(results_adjusted_penalized, berlin)
monthly_berlin <- berlin %>%
as_tibble(rownames = "station") %>%
gather(time, temperature, -station) %>%
mutate(month = floor_date(ymd(time), "month")) %>%
group_by(station, month) %>%
summarize(temperature = mean(temperature)) %>%
spread(month, temperature) %>% {
stations <- .$station
result <- as.matrix(.[, -1])
rownames(result) <- stations
result
}
monthly_berlin %>%
colMeans() %>%
enframe("time", "temperature") %>%
mutate_at(vars(time), ymd) %>%
with(plot(time, temperature, cex=0.2))
penalized_cost <- auto_penalize(
monthly_berlin,
cost = residual_cost,
small_segment_penalty = 100
)
results_downscaled <- segment(
monthly_berlin,
cost = penalized_cost,
algorithm = "exact"
)
print_results_table(
results_downscaled,
caption="Results of the segmentation algorithm
using an auto-penalized cost function over
downsampled berlin data"
)
plot_results(results_downscaled, monthly_berlin)
rescaled_changepoints <- round(
results_downscaled$changepoints
* ncol(berlin) / ncol(monthly_berlin)
)
results_rescaled <- with_segments(
changepoints=rescaled_changepoints,
len=ncol(berlin)
)
print_results_table(
results_rescaled,
caption="Results of the segmentation algorithm
using an auto-penalized cost function over
downsampled berlin data, but rescaled to fit the
original data dimensions"
)
plot_results(results_rescaled, berlin)
deviation <- partial(
segment_distance,
changepoints2=expected_berlin$changepoints
)
tribble(
~`Results referenced`,
~`Estimated Changepoints`,
~`Hausdorff Distance to Ideal`,
"Table 5.2",
comma_format(expected_berlin$changepoints),
deviation(expected_berlin$changepoints),
"Table 5.4",
comma_format(results_non_penalized$changepoints),
deviation(results_non_penalized$changepoints),
"Table 5.5",
comma_format(results_auto_penalized$changepoints),
deviation(results_auto_penalized$changepoints),
"Table 5.6",
comma_format(results_adjusted_penalized$changepoints),
deviation(results_adjusted_penalized$changepoints),
"Table 5.8",
comma_format(results_rescaled$changepoints),
deviation(results_rescaled$changepoints)
) %>% kable(
caption="Hausdorff distance for different
segmentation attepmts over the Berlin weather data set"
) %>% column_spec(2, width="15em")
rsquared_cost <- function (data) {
as_tibble(t(data)) %>%
rowid_to_column() %>%
gather(station, temperature, -rowid) %>%
with(lm(temperature ~ rowid)) %>%
summary %>%
.$adj.r.squared %>%
{ -. }
}
tibble(
`Small Size`=rsquared_cost(berlin[, 2:3]),
`Medium Size`=rsquared_cost(berlin[, 1:150]),
`Large Size`=rsquared_cost(berlin)
) %>%
kable(
caption="Sample values of the R-squared
cost function for different sizes of segment
candidates"
)
penalized_cost <- auto_penalize(
berlin,
cost = rsquared_cost
)
results <- segment(
berlin,
cost = penalized_cost,
algorithm = "hierarchical"
)
print_results_table(
results,
caption="Results of the segmentation algorithm
using an auto-penalized R-squared cost function over
Berlin data"
)
plot_results(results, berlin)
penalized_cost <- auto_penalize(
berlin,
cost = rsquared_cost,
small_segment_penalty = 1.5
)
results <- segment(
berlin,
cost = penalized_cost,
algorithm = "hierarchical"
)
print_results_table(
results,
caption="Results of the segmentation algorithm
using an adjusted penalized R-squared cost
function over Berlin data"
)
plot_results(results, berlin)
sub_berlin <- berlin[, 1:547]
penalized_cost <- auto_penalize(
sub_berlin,
cost = rsquared_cost
)
results <- segment(
sub_berlin,
cost = penalized_cost,
algorithm = "hierarchical"
)
print_results_table(
results,
caption="Results of the segmentation algorithm
using an auto-penalized R-squared cost
function over a subset of Berlin data"
)
plot_results(results, sub_berlin)
source("helper.R")
library(microbenchmark)
data <- makeRandom(20, 100)
bench <- microbenchmark(
segment(data,
cost = function(x) -multivariate(x),
algorithm = "exact"),
segment(data,
cost = function(x) -r_multivariate(x),
algorithm = "exact"),
segment(data,
cost = function(x) -multivariate(x),
algorithm = "hierarchical"),
segment(data,
cost = function(x) -r_multivariate(x),
algorithm = "hierarchical"),
times = 1
)
print_benchmark(
bench,
caption="Execution time comparison
between native C++ and interpreted R code"
)
data <- makeRandom(100, 10)
doMC::registerDoMC(4)
bench <- microbenchmark(
segment(data,
cost = function(x) -multivariate(x),
algorithm = "exact",
allow_parallel = FALSE),
segment(data,
cost = function(x) -multivariate(x),
algorithm = "exact",
allow_parallel = TRUE),
segment(data,
cost = function(x) -multivariate(x),
algorithm = "hierarchical",
allow_parallel = FALSE),
segment(data,
cost = function(x) -multivariate(x),
algorithm = "hierarchical",
allow_parallel = TRUE),
times = 1
)
print_benchmark(
bench,
caption="Execution time comparison
between parallel and single-threaded computation
with a data set of 100 samples and 10 columns"
)
data <- makeRandom(5, 100)
doMC::registerDoMC(4)
bench <- microbenchmark(
segment(data,
cost = function(x) -multivariate(x),
algorithm = "exact",
allow_parallel = FALSE),
segment(data,
cost = function(x) -multivariate(x),
algorithm = "exact",
allow_parallel = TRUE),
segment(data,
cost = function(x) -multivariate(x),
algorithm = "hierarchical",
allow_parallel = FALSE),
segment(data,
cost = function(x) -multivariate(x),
algorithm = "hierarchical",
allow_parallel = TRUE),
times = 1
)
print_benchmark(
bench,
caption="Execution time comparison
between parallel and single-threaded computation
with a data set of 5 samples and 100 columns"
)
install.packages("segmentr")
library("segmentr")
data <- rbind(
c(1, 1, 0, 0, 0, 1023, 134521, 12324),
c(1, 1, 0, 0, 0, -20941, 1423, 14334),
c(1, 1, 0, 0, 0, 2398439, 1254, 146324),
c(1, 1, 0, 0, 0, 24134, 1, 15323),
c(1, 1, 0, 0, 0, -231, 1256, 13445),
c(1, 1, 0, 0, 0, 10000, 1121, 331)
)
segment(
data,
algorithm = "exact",
cost = function(X) -multivariate(X) + 0.01*exp(ncol(X))
)
vignette("segmentr")
Castro, Bruno M., Renan B. Lemes, Jonatas Cesar, Tábita Hünemeier, and Florencia Leonardi. 2018. “A Model Selection Approach for Multiple Sequence Segmentation and Dimensionality Reduction.” Journal of Multivariate Analysis 167: 319–30. https://doi.org/https://doi.org/10.1016/j.jmva.2018.05.006.
Ceballos, Francisco, Peter Joshi, David W. Clark, Michèle Ramsay, and James F. Wilson. 2018. “Runs of Homozygosity: Windows into Population History and Trait Architecture.” Nature Reviews Genetics 19 (January). https://doi.org/10.1038/nrg.2017.109.
Eddelbuettel, Dirk, and Romain François. 2011. “Rcpp: Seamless R and C++ Integration.” Journal of Statistical Software 40 (8): 1–18. https://doi.org/10.18637/jss.v040.i08.
Maidstone, Robert, Toby Hocking, Guillem Rigaill, and Paul Fearnhead. 2017. “On Optimal Multiple Changepoint Algorithms for Large Data.” Statistics and Computing 27 (2): 519–33. https://doi.org/10.1007/s11222-016-9636-3.
Mello, Thales. 2019. “Segmentr: An R Package to Segment Data with Minimal Total Cost (Aka Change Points).” GitHub Repository. https://github.com/thalesmello/segmentr; GitHub.
Mello, Thales, and Florencia Leonardi. 2019. Segmentr: Segment Data Minimizing a Cost Function. https://CRAN.R-project.org/package=segmentr.
Munkres, J. R. 2000. Topology. Featured Titles for Topology Series. Prentice Hall, Incorporated. https://books.google.com.br/books?id=XjoZAQAAIAAJ.
Park, K. I. 2017. Fundamentals of Probability and Stochastic Processes with Applications to Communications. Springer International Publishing. https://books.google.com.br/books?id=cd2SswEACAAJ.
R Core Team. 2018. “R: A Language and Environment for Statistical Computing.” Vienna, Austria: R Foundation for Statistical Computing. https://www.R-project.org/.
Wetterdienst, Deutscher. 2019. “Climate Data Center - Ftp Server.” https://www.dwd.de/EN/climate_environment/cdc/cdc.html; Deutscher Wetterdienst.
Wickham, Hadley. 2015. R Packages. 1st ed. O’Reilly Media, Inc. http://r-pkgs.had.co.nz/.