B Support Code

library(knitr)
library(kableExtra)
library(segmentr)
library(tibble)
library(magrittr)
library(purrr)
knitr::opts_chunk$set(
  cache=TRUE,
  echo=FALSE
)
source("helper.R")
n <- 100
B1 <- sample(1:2, n, replace = TRUE)
B2 <- sample(1:2, n, replace = TRUE)
B3 <- sample(1:2, n, replace = TRUE)
B4 <- sample(1:2, n, replace = TRUE)
B5 <- sample(1:2, n, replace = TRUE)
B6 <- sample(1:2, n, replace = TRUE)

X1 <- B1
X2 <- B1 - B2
X3 <- B2
X4 <- B1 + B2
X5 <- B1
X6 <- B3
X7 <- B3 - B4
X8 <- B4
X9 <- B3 + B4
X10 <- B3
X11 <- B5
X12 <- B5 - B6
X13 <- B6
X14 <- B5 + B6
X15 <- B5
D_example <- cbind(X1, X2, X3, X4, X5, X6, X7, X8,
                   X9, X10, X11, X12, X13, X14, X15)
head(D_example) %>%
  kable(caption="Sample values of the correlated random
        variables defined in (4.1).")
multivariate_cost = function(X) -multivariate(X) + 2 ^ ncol(X)

results_multivariate_penalized <- segment(
  D_example,
  cost = multivariate_cost,
  algorithm = "exact"
)

print_results_table(
  results_multivariate_penalized,
  caption="Segment tables as defined in
  (4.1)."
)
segment(D_example,
        cost = function(X) -multivariate(X),
        algorithm = "exact") %>%
  print_results_table(
    caption="Results of segmentation using a non-penalized
      multivariate cost")
heterogeneity_cost <- function(X) {
  mean_value <- mean(X, na.rm = T)
  if (is.na(mean_value)) {
    0
  } else {
    sum((X - mean_value)^2)
  }
}

penalized_heterogeneity_cost <- function(X) heterogeneity_cost(X) + 1

make_segment <- function(n, p) {
  matrix(rbinom(100 * n, 1, p), nrow = 100)
}

D_genetic <- cbind(
  make_segment(5, 0.9),
  make_segment(10, 0.1),
  make_segment(5, 0.9)
)

segment(
  D_genetic,
  cost = penalized_heterogeneity_cost,
  algorithm = "hieralg"
) %>%
  print_results_table(
    caption="Results of segmentation by applying the
      cost function defined in
      (4.5) to a sample of size
      10 of the model defined in (4.6).
    ")
expected_multivariate_example <- c(6, 11)

deviation <- partial(
  segment_distance,
  changepoints2=expected_multivariate_example
)

exact_multivariate_changepoints <- segment(
  D_example,
  cost = multivariate_cost,
  algorithm = "exact"
)$changepoints

hieralg_multivariate_changepoints <- segment(
  D_example,
  cost = multivariate_cost,
  algorithm = "hierarchical"
)$changepoints

hybrid_multivariate_changepoints <- segment(
  D_example,
  cost = multivariate_cost,
  algorithm = "hybrid"
)$changepoints

hybrid_multivariate_changepoints_threshold <- segment(
  D_example,
  cost = multivariate_cost,
  algorithm = "hybrid",
  threshold = 4
)$changepoints

tribble(
  ~`Description`,
    ~`Changepoints`,
      ~`Hausdorff Distance`,
  "Expected solution",
    comma_format(expected_multivariate_example),
      deviation(expected_multivariate_example),
  "Exact algorithm estimate",
    comma_format(exact_multivariate_changepoints),
      deviation(exact_multivariate_changepoints),
  "Hierarchical algorithm estimate",
    comma_format(hieralg_multivariate_changepoints),
      deviation(hieralg_multivariate_changepoints),
  "Hybrid algorithm estimate with threshold 50",
    comma_format(hybrid_multivariate_changepoints),
      deviation(hybrid_multivariate_changepoints),
  "Hybrid algorithm estimate with threshold 4",
    comma_format(hybrid_multivariate_changepoints_threshold),
      deviation(hybrid_multivariate_changepoints_threshold)
) %>%
  kable(caption="Comparison of solutions of different algorithms
to segment the data set described in (4.1),
measuring how far each solution is from the ideal
solution using the Hausdorff distance.")
expected_mean_example <- c(6, 16)

deviation <- partial(
  segment_distance,
  changepoints2=expected_mean_example
)

exact_mean_changepoints <- segment(
  D_genetic,
  cost = penalized_heterogeneity_cost,
  algorithm = "exact"
)$changepoints

hieralg_mean_changepoints <- segment(
  D_genetic,
  cost = penalized_heterogeneity_cost,
  algorithm = "hierarchical"
)$changepoints

hybrid_mean_changepoints <- segment(
  D_genetic,
  cost = penalized_heterogeneity_cost,
  algorithm = "hybrid"
)$changepoints

hybrid_mean_changepoints_threshold <- segment(
  D_genetic,
  cost = penalized_heterogeneity_cost,
  algorithm = "hybrid",
  threshold = 4
)$changepoints

tribble(
  ~`Description`,
    ~`Changepoints`,
      ~`Hausdorff Distance`,
  "Expected solution",
    comma_format(expected_mean_example),
      deviation(expected_mean_example),
  "Exact algorithm estimate",
    comma_format(exact_mean_changepoints),
      deviation(exact_mean_changepoints),
  "Hierarchical algorithm estimate",
    comma_format(hieralg_mean_changepoints),
      deviation(hieralg_mean_changepoints),
  "Hybrid algorithm estimate with threshold 50",
    comma_format(hybrid_mean_changepoints),
      deviation(hybrid_mean_changepoints),
  "Hybrid algorithm estimate with threshold 4",
    comma_format(hybrid_mean_changepoints_threshold),
      deviation(hybrid_mean_changepoints_threshold)
) %>%
  kable(caption="Comparison of solutions of different algorithms
to segment the data set described in (4.6),
measuring how far each solution is from the ideal
solution using the Hausdorff distance.")
library(segmentr)
library(tidyr)
library(tibble)
library(dplyr)
library(lubridate)
library(magrittr)
library(purrr)
library(knitr)
library(kableExtra)
data(berlin)
as_tibble(berlin, rownames="station") %>%
  mutate(`..`="..") %>%
  select(station, `2010-01-01`:`2010-01-03`,
         `..`, `2011-12-29`:`2011-12-31`) %>%
  kable(caption="
    First and last columns of the `berlin` dataset
  ") %>%
  column_spec(1, width="5em")
berlin %>%
  colMeans() %>%
  enframe("time", "temperature") %>%
  mutate_at(vars(time), ymd) %>%
  with(plot(time, temperature, cex=0.2))
expected_berlin <- list(
  changepoints=c(200, 360, 570),
  segments=list(1:199, 360:569, 570:ncol(berlin))
)

plot_results(expected_berlin, berlin)
print_results_table(
  expected_berlin,
  caption="Expected values to be found when segmenting
           the Berlin dataset"
)
residual_cost <- function (data) {
  fit <- t(data) %>%
    as_tibble() %>%
    rowid_to_column() %>%
    gather(station, temperature, -rowid) %>%
    with(lm(temperature ~ rowid))

  mean(fit$residuals ^ 2)
}

tibble(
  `Small Size`=residual_cost(berlin[, 2:3]),
  `Medium Size`=residual_cost(berlin[, 1:150]),
  `Large Size`=residual_cost(berlin)
) %>%
  kable(
    caption="Sample values of the squared residual cost
    function for different sizes of segment
    candidates"
  )
results_non_penalized <- segment(
  berlin,
  cost = residual_cost,
  algorithm = "hierarchical"
)

print_results_table(
  results_non_penalized,
  caption="Results of the segmentation algorithm
  using a non-penalized cost function"
)
plot_results(results_non_penalized, berlin)

plot_curve(~ exp(0.3*(. - 50)) + exp(0.3 * (-. + 50)),
           from = 0, to = 100, type="l")
penalized_cost <- auto_penalize(berlin,
                                cost = residual_cost)
results_auto_penalized <- segment(
  berlin,
  cost = residual_cost,
  algorithm = "hierarchical"
)

print_results_table(
  results_auto_penalized,
  caption="Results of the segmentation algorithm
  using an auto-penalized cost function"
)
plot_results(results_auto_penalized, berlin)
penalized_cost <- auto_penalize(
  berlin,
  cost = residual_cost,
  big_segment_penalty = 2
)
results_adjusted_penalized <- segment(
  berlin,
  cost = penalized_cost,
  algorithm = "hierarchical"
)
print_results_table(
  results_adjusted_penalized,
  caption="Results of the segmentation algorithm
  using an adjusted penalized cost function"
)
plot_results(results_adjusted_penalized, berlin)
monthly_berlin <- berlin %>%
  as_tibble(rownames = "station") %>%
  gather(time, temperature, -station) %>%
  mutate(month = floor_date(ymd(time), "month")) %>%
  group_by(station, month) %>%
  summarize(temperature = mean(temperature)) %>%
  spread(month, temperature) %>% {
    stations <- .$station
    result <- as.matrix(.[, -1])
    rownames(result) <- stations
    result
  }

monthly_berlin %>%
  colMeans() %>%
  enframe("time", "temperature") %>%
  mutate_at(vars(time), ymd) %>%
  with(plot(time, temperature, cex=0.2))
penalized_cost <- auto_penalize(
  monthly_berlin,
  cost = residual_cost,
  small_segment_penalty = 100
)

results_downscaled <- segment(
  monthly_berlin,
  cost = penalized_cost,
  algorithm = "exact"
)

print_results_table(
  results_downscaled,
  caption="Results of the segmentation algorithm
    using an auto-penalized cost function over
    downsampled berlin data"
)
plot_results(results_downscaled, monthly_berlin)
rescaled_changepoints <- round(
  results_downscaled$changepoints
  * ncol(berlin) / ncol(monthly_berlin)
)

results_rescaled <- with_segments(
  changepoints=rescaled_changepoints,
  len=ncol(berlin)
)

print_results_table(
  results_rescaled,
  caption="Results of the segmentation algorithm
    using an auto-penalized cost function over
    downsampled berlin data, but rescaled to fit the
    original data dimensions"
)
plot_results(results_rescaled, berlin)
deviation <- partial(
  segment_distance,
  changepoints2=expected_berlin$changepoints
)

tribble(
  ~`Results referenced`,
    ~`Estimated Changepoints`,
      ~`Hausdorff Distance to Ideal`,
  "Table 5.2",
    comma_format(expected_berlin$changepoints),
      deviation(expected_berlin$changepoints),
  "Table 5.4",
    comma_format(results_non_penalized$changepoints),
      deviation(results_non_penalized$changepoints),
  "Table 5.5",
    comma_format(results_auto_penalized$changepoints),
      deviation(results_auto_penalized$changepoints),
  "Table 5.6",
    comma_format(results_adjusted_penalized$changepoints),
      deviation(results_adjusted_penalized$changepoints),
  "Table 5.8",
    comma_format(results_rescaled$changepoints),
      deviation(results_rescaled$changepoints)
) %>% kable(
  caption="Hausdorff distance for different
  segmentation attepmts over the Berlin weather data set"
) %>% column_spec(2, width="15em")
rsquared_cost <- function (data) {
  as_tibble(t(data)) %>%
    rowid_to_column() %>%
    gather(station, temperature, -rowid) %>%
    with(lm(temperature ~ rowid)) %>%
    summary %>%
    .$adj.r.squared %>%
    { -. }
}

tibble(
  `Small Size`=rsquared_cost(berlin[, 2:3]),
  `Medium Size`=rsquared_cost(berlin[, 1:150]),
  `Large Size`=rsquared_cost(berlin)
) %>%
  kable(
    caption="Sample values of the R-squared
    cost function for different sizes of segment
    candidates"
  )
penalized_cost <- auto_penalize(
  berlin,
  cost = rsquared_cost
)
results <- segment(
  berlin,
  cost = penalized_cost,
  algorithm = "hierarchical"
)

print_results_table(
  results,
  caption="Results of the segmentation algorithm
    using an auto-penalized R-squared cost function over
    Berlin data"
)
plot_results(results, berlin)
penalized_cost <- auto_penalize(
  berlin,
  cost = rsquared_cost,
  small_segment_penalty = 1.5
)
results <- segment(
  berlin,
  cost = penalized_cost,
  algorithm = "hierarchical"
)
print_results_table(
  results,
  caption="Results of the segmentation algorithm
    using an adjusted penalized R-squared cost
    function over Berlin data"
)
plot_results(results, berlin)
sub_berlin <- berlin[, 1:547]
penalized_cost <- auto_penalize(
  sub_berlin,
  cost = rsquared_cost
)
results <- segment(
  sub_berlin,
  cost = penalized_cost,
  algorithm = "hierarchical"
)
print_results_table(
  results,
  caption="Results of the segmentation algorithm
    using an auto-penalized R-squared cost
    function over a subset of Berlin data"
)
plot_results(results, sub_berlin)
source("helper.R")
library(microbenchmark)
data <- makeRandom(20, 100)

bench <- microbenchmark(
  segment(data,
          cost = function(x) -multivariate(x),
          algorithm = "exact"),
  segment(data,
          cost = function(x) -r_multivariate(x),
          algorithm = "exact"),
  segment(data,
          cost = function(x) -multivariate(x),
          algorithm = "hierarchical"),
  segment(data,
          cost = function(x) -r_multivariate(x),
          algorithm = "hierarchical"),
  times = 1
)

print_benchmark(
  bench,
  caption="Execution time comparison
    between native C++ and interpreted R code"
)
data <- makeRandom(100, 10)
doMC::registerDoMC(4)
bench <- microbenchmark(
  segment(data,
          cost = function(x) -multivariate(x),
          algorithm = "exact",
          allow_parallel = FALSE),
  segment(data,
          cost = function(x) -multivariate(x),
          algorithm = "exact",
          allow_parallel = TRUE),
  segment(data,
          cost = function(x) -multivariate(x),
          algorithm = "hierarchical",
          allow_parallel = FALSE),
  segment(data,
          cost = function(x) -multivariate(x),
          algorithm = "hierarchical",
          allow_parallel = TRUE),
  times = 1
)
print_benchmark(
  bench,
  caption="Execution time comparison
    between parallel and single-threaded computation
    with a data set of 100 samples and 10 columns"
)
data <- makeRandom(5, 100)
doMC::registerDoMC(4)
bench <- microbenchmark(
  segment(data,
          cost = function(x) -multivariate(x),
          algorithm = "exact",
          allow_parallel = FALSE),
  segment(data,
          cost = function(x) -multivariate(x),
          algorithm = "exact",
          allow_parallel = TRUE),
  segment(data,
          cost = function(x) -multivariate(x),
          algorithm = "hierarchical",
          allow_parallel = FALSE),
  segment(data,
          cost = function(x) -multivariate(x),
          algorithm = "hierarchical",
          allow_parallel = TRUE),
  times = 1
)

print_benchmark(
  bench,
  caption="Execution time comparison
    between parallel and single-threaded computation
    with a data set of 5 samples and 100 columns"
)
install.packages("segmentr")
library("segmentr")

data <- rbind(
  c(1, 1, 0, 0, 0, 1023, 134521, 12324),
  c(1, 1, 0, 0, 0, -20941, 1423, 14334),
  c(1, 1, 0, 0, 0, 2398439, 1254, 146324),
  c(1, 1, 0, 0, 0, 24134, 1, 15323),
  c(1, 1, 0, 0, 0, -231, 1256, 13445),
  c(1, 1, 0, 0, 0, 10000, 1121, 331)
)

segment(
  data,
  algorithm = "exact",
  cost = function(X) -multivariate(X) + 0.01*exp(ncol(X))
)
vignette("segmentr")

Castro, Bruno M., Renan B. Lemes, Jonatas Cesar, Tábita Hünemeier, and Florencia Leonardi. 2018. “A Model Selection Approach for Multiple Sequence Segmentation and Dimensionality Reduction.” Journal of Multivariate Analysis 167: 319–30. https://doi.org/https://doi.org/10.1016/j.jmva.2018.05.006.

Ceballos, Francisco, Peter Joshi, David W. Clark, Michèle Ramsay, and James F. Wilson. 2018. “Runs of Homozygosity: Windows into Population History and Trait Architecture.” Nature Reviews Genetics 19 (January). https://doi.org/10.1038/nrg.2017.109.

Eddelbuettel, Dirk, and Romain François. 2011. “Rcpp: Seamless R and C++ Integration.” Journal of Statistical Software 40 (8): 1–18. https://doi.org/10.18637/jss.v040.i08.

Maidstone, Robert, Toby Hocking, Guillem Rigaill, and Paul Fearnhead. 2017. “On Optimal Multiple Changepoint Algorithms for Large Data.” Statistics and Computing 27 (2): 519–33. https://doi.org/10.1007/s11222-016-9636-3.

Mello, Thales. 2019. “Segmentr: An R Package to Segment Data with Minimal Total Cost (Aka Change Points).” GitHub Repository. https://github.com/thalesmello/segmentr; GitHub.

Mello, Thales, and Florencia Leonardi. 2019. Segmentr: Segment Data Minimizing a Cost Function. https://CRAN.R-project.org/package=segmentr.

Munkres, J. R. 2000. Topology. Featured Titles for Topology Series. Prentice Hall, Incorporated. https://books.google.com.br/books?id=XjoZAQAAIAAJ.

Park, K. I. 2017. Fundamentals of Probability and Stochastic Processes with Applications to Communications. Springer International Publishing. https://books.google.com.br/books?id=cd2SswEACAAJ.

R Core Team. 2018. “R: A Language and Environment for Statistical Computing.” Vienna, Austria: R Foundation for Statistical Computing. https://www.R-project.org/.

Wetterdienst, Deutscher. 2019. “Climate Data Center - Ftp Server.” https://www.dwd.de/EN/climate_environment/cdc/cdc.html; Deutscher Wetterdienst.

Wickham, Hadley. 2015. R Packages. 1st ed. O’Reilly Media, Inc. http://r-pkgs.had.co.nz/.