diff --git a/DESCRIPTION b/DESCRIPTION
index dee50fe..e5c2960 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -16,3 +16,4 @@ Suggests: testthat
 License: GPL-3
 LazyData: true
 Packaged: 2014-12-08 07:54:07 UTC; owenvallis
+RoxygenNote: 6.0.1
diff --git a/NAMESPACE b/NAMESPACE
index b6fa8d9..18dfab3 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,4 +1,4 @@
-# Generated by roxygen2 (4.1.0): do not edit by hand
+# Generated by roxygen2: do not edit by hand
 
 export(AnomalyDetectionTs)
 export(AnomalyDetectionVec)
diff --git a/R/date_utils.R b/R/date_utils.R
index f46ba58..bc4cd52 100644
--- a/R/date_utils.R
+++ b/R/date_utils.R
@@ -1,40 +1,41 @@
 format_timestamp <- function(indf, index = 1) {
-  if (class(indf[[index]])[1] == "POSIXlt") {
+  if (class(indf[[index]])[1] == "POSIXct") {
     return(indf)
   }
 
   if (stringr::str_detect(indf[[index]][1], "^\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2} \\+\\d{4}$")) {
-    indf[[index]] <- strptime(indf[[index]], format="%Y-%m-%d %H:%M:%S", tz="UTC")
+    indf[[index]] <- strptime(indf[[index]], format = "%Y-%m-%d %H:%M:%S", tz = "UTC")
   } else if (stringr::str_detect(indf[[index]][1], "^\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}$")) {
-    indf[[index]] <- strptime(indf[[index]], format="%Y-%m-%d %H:%M:%S", tz="UTC")
+    indf[[index]] <- strptime(indf[[index]], format = "%Y-%m-%d %H:%M:%S", tz = "UTC")
   } else if (stringr::str_detect(indf[[index]][1], "^\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}$")) {
-    indf[[index]] <- strptime(indf[[index]], format="%Y-%m-%d %H:%M", tz="UTC")
+    indf[[index]] <- strptime(indf[[index]], format = "%Y-%m-%d %H:%M", tz = "UTC")
   } else if (stringr::str_detect(indf[[index]][1], "^\\d{2}/\\d{2}/\\d{2}$")) {
-    indf[[index]] <- strptime(indf[[index]], format="%m/%d/%y", tz="UTC")
+    indf[[index]] <- strptime(indf[[index]], format = "%m/%d/%y", tz = "UTC")
   } else if (stringr::str_detect(indf[[index]][1], "^\\d{2}/\\d{2}/\\d{4}$")) {
-    indf[[index]] <- strptime(indf[[index]], format="%m/%d/%Y", tz="UTC")
+    indf[[index]] <- strptime(indf[[index]], format = "%m/%d/%Y", tz = "UTC")
   } else if (stringr::str_detect(indf[[index]][1], "^\\d{4}\\d{2}\\d{2}$")) {
-    indf[[index]] <- strptime(indf[[index]], format="%Y%m%d", tz="UTC")
+    indf[[index]] <- strptime(indf[[index]], format = "%Y%m%d", tz = "UTC")
   } else if (stringr::str_detect(indf[[index]][1], "^\\d{4}/\\d{2}/\\d{2}/\\d{2}$")) {
-    indf[[index]] <- strptime(indf[[index]], format="%Y/%m/%d/%H", tz="UTC")
+    indf[[index]] <- strptime(indf[[index]], format = "%Y/%m/%d/%H", tz = "UTC")
   } else if (stringr::str_detect(indf[[index]][1], "^\\d{10}$")) {
     # Handle Unix seconds in milliseconds
-    indf[[index]] <- as.POSIXlt(indf[[index]], origin="1970-01-01", tz="UTC")
+    indf[[index]] <- as.POSIXlt(indf[[index]], origin = "1970-01-01", tz = "UTC")
   }
 
+  indf[[index]] <- as.POSIXct(indf[[index]], tz = "UTC")
   return(indf)
 }
 
 get_gran = function(tsdf, index=1) {
   n = length(tsdf[[index]])
   # We calculate the granularity from the time difference between the last 2 entries (sorted)
-  gran = round(difftime(max(tsdf[[index]]), sort(tsdf[[index]], partial = n-1)[n-1],
+  gran = round(difftime(max(tsdf[[index]]), sort(tsdf[[index]], partial = n - 1)[n - 1],
                         units = "secs"))
 
   if (gran >= 86400) {
diff --git a/R/ts_anom_detection.R b/R/ts_anom_detection.R
index a7e3abe..9571523 100644
--- a/R/ts_anom_detection.R
+++ b/R/ts_anom_detection.R
@@ -67,14 +67,14 @@ AnomalyDetectionTs <- function(x, max_anoms = 0.10, direction = 'pos',
                                title = NULL, verbose=FALSE, na.rm = FALSE){
 
   # Check for supported inputs types
-  if(!is.data.frame(x)){
+  if (!is.data.frame(x)) {
     stop("data must be a single data frame.")
   } else {
-    if(ncol(x) != 2 || !is.numeric(x[[2]])){
+    if (ncol(x) != 2 || !is.numeric(x[[2]])) {
      stop("data must be a 2 column data.frame, with the first column being a set of timestamps, and the second coloumn being numeric values.")
     }
     # Format timestamps if necessary
-    if (!(class(x[[1]])[1] == "POSIXlt")) {
+    if (!(class(x[[1]])[1] == "POSIXct")) {
       x <- format_timestamp(x)
     }
   }
@@ -83,13 +83,13 @@ AnomalyDetectionTs <- function(x, max_anoms = 0.10, direction = 'pos',
     colnames(x) <- c("timestamp", "count")
   }
 
-  if(!is.logical(na.rm)){
+  if (!is.logical(na.rm)) {
     stop("na.rm must be either TRUE (T) or FALSE (F)")
   }
 
   # Deal with NAs in timestamps
-  if(any(is.na(x$timestamp))){
-    if(na.rm){
+  if (any(is.na(x$timestamp))) {
+    if (na.rm) {
       x <- x[-which(is.na(x$timestamp)), ]
     } else {
       stop("timestamp contains NAs, please set na.rm to TRUE or remove the NAs manually.")
@@ -97,53 +97,53 @@ AnomalyDetectionTs <- function(x, max_anoms = 0.10, direction = 'pos',
   }
 
   # Sanity check all input parameters
-  if(max_anoms > .49){
+  if (max_anoms > .49) {
     stop(paste("max_anoms must be less than 50% of the data points (max_anoms =", round(max_anoms*length(x[[2]]), 0), " data_points =", length(x[[2]]),")."))
-  } else if(max_anoms < 0){
+  } else if (max_anoms < 0) {
     stop("max_anoms must be positive.")
-  } else if(max_anoms == 0){
+  } else if (max_anoms == 0) {
     warning("0 max_anoms results in max_outliers being 0.")
   }
-  if(!direction %in% c('pos', 'neg', 'both')){
+  if (!direction %in% c('pos', 'neg', 'both')) {
     stop("direction options are: pos | neg | both.")
   }
-  if(!(0.01 <= alpha || alpha <= 0.1)){
-    if(verbose) message("Warning: alpha is the statistical signifigance, and is usually between 0.01 and 0.1")
+  if (!(0.01 <= alpha || alpha <= 0.1)) {
+    if (verbose) message("Warning: alpha is the statistical signifigance, and is usually between 0.01 and 0.1")
   }
-  if(!is.null(only_last) && !only_last %in% c('day','hr')){
+  if (!is.null(only_last) && !only_last %in% c('day','hr')) {
     stop("only_last must be either 'day' or 'hr'")
   }
-  if(!threshold %in% c('None','med_max','p95','p99')){
+  if (!threshold %in% c('None','med_max','p95','p99')) {
     stop("threshold options are: None | med_max | p95 | p99.")
   }
-  if(!is.logical(e_value)){
+  if (!is.logical(e_value)) {
     stop("e_value must be either TRUE (T) or FALSE (F)")
   }
-  if(!is.logical(longterm)){
+  if (!is.logical(longterm)) {
     stop("longterm must be either TRUE (T) or FALSE (F)")
   }
-  if(piecewise_median_period_weeks < 2){
+  if (piecewise_median_period_weeks < 2) {
     stop("piecewise_median_period_weeks must be at greater than 2 weeks")
   }
-  if(!is.logical(plot)){
+  if (!is.logical(plot)) {
     stop("plot must be either TRUE (T) or FALSE (F)")
   }
-  if(!is.logical(y_log)){
+  if (!is.logical(y_log)) {
     stop("y_log must be either TRUE (T) or FALSE (F)")
   }
-  if(!is.character(xlabel)){
+  if (!is.character(xlabel)) {
     stop("xlabel must be a string")
   }
-  if(!is.character(ylabel)){
+  if (!is.character(ylabel)) {
     stop("ylabel must be a string")
   }
-  if(!is.character(title) && !is.null(title)){
+  if (!is.character(title) && !is.null(title)) {
     stop("title must be a string")
   }
-  if(is.null(title)){
+  if (is.null(title)) {
     title <- ""
   } else {
-    title <- paste(title, " : ", sep="")
+    title <- paste(title, " : ", sep = "")
   }
 
   # -- Main analysis: Perform S-H-ESD
@@ -152,9 +152,9 @@ AnomalyDetectionTs <- function(x, max_anoms = 0.10, direction = 'pos',
   # Although we derive this in S-H-ESD, we also need it to be minutley later on so we do it here first.
   gran <- get_gran(x, 1)
 
-  if(gran == "day"){
+  if (gran == "day") {
     num_days_per_line <- 7
-    if(is.character(only_last) && only_last == 'hr'){
+    if (is.character(only_last) && only_last == 'hr') {
       only_last <- 'day'
     }
   } else {
@@ -162,8 +162,9 @@ AnomalyDetectionTs <- function(x, max_anoms = 0.10, direction = 'pos',
   }
 
   # Aggregate data to minutely if secondly
-  if(gran == "sec"){
-    x <- format_timestamp(aggregate(x[2], format(x[1], "%Y-%m-%d %H:%M:00"), eval(parse(text="sum"))))
+  if (gran == "sec") {
+    x <- format_timestamp(aggregate(x[2], format(x[1], "%Y-%m-%d %H:%M:00"),
+                                    eval(parse(text = "sum"))))
   }
 
   period = switch(gran,
@@ -173,17 +174,17 @@ AnomalyDetectionTs <- function(x, max_anoms = 0.10, direction = 'pos',
                   day = 7)
 
   num_obs <- length(x[[2]])
 
-  if(max_anoms < 1/num_obs){
+  if (max_anoms < 1/num_obs) {
     max_anoms <- 1/num_obs
   }
 
   # -- Setup for longterm time series
   # If longterm is enabled, break the data into subset data frames and store in all_data
-  if(longterm){
+  if (longterm) {
     # Pre-allocate list with size equal to the number of piecewise_median_period_weeks chunks in x + any left over chunk
     # handle edge cases for daily and single column data period lengths
-    if(gran == "day"){
+    if (gran == "day") {
       # STL needs 2*period + 1 observations
       num_obs_in_period <- period*piecewise_median_period_weeks + 1
       num_days_in_period <- (7*piecewise_median_period_weeks) + 1
@@ -195,62 +196,62 @@ AnomalyDetectionTs <- function(x, max_anoms = 0.10, direction = 'pos',
     # Store last date in time series
     last_date <- x[[1]][num_obs]
 
-    all_data <- vector(mode="list", length=ceiling(length(x[[1]])/(num_obs_in_period)))
+    all_data <- vector(mode = "list", length = ceiling(length(x[[1]])/(num_obs_in_period)))
     # Subset x into piecewise_median_period_weeks chunks
-    for(j in seq(1,length(x[[1]]), by=num_obs_in_period)){
+    for (j in seq(1, length(x[[1]]), by = num_obs_in_period)) {
       start_date <- x[[1]][j]
       end_date <- min(start_date + lubridate::days(num_days_in_period), x[[1]][length(x[[1]])])
      # if there is at least 14 days left, subset it, otherwise subset last_date - 14days
-      if(difftime(end_date, start_date, units = "days") == as.difftime(num_days_in_period, units="days")){
+      if (difftime(end_date, start_date, units = "days") == as.difftime(num_days_in_period, units = "days")) {
        all_data[[ceiling(j/(num_obs_in_period))]] <- subset(x, x[[1]] >= start_date & x[[1]] < end_date)
-      }else{
-        all_data[[ceiling(j/(num_obs_in_period))]] <- subset(x, x[[1]] > (last_date-lubridate::days(num_days_in_period)) & x[[1]] <= last_date)
+      } else {
+        all_data[[ceiling(j/(num_obs_in_period))]] <- subset(x, x[[1]] > (last_date - lubridate::days(num_days_in_period)) & x[[1]] <= last_date)
      }
    }
-  }else{
+  } else {
    # If longterm is not enabled, then just overwrite all_data list with x as the only item
    all_data <- list(x)
  }
 
   # Create empty data frames to store all anoms and seasonal+trend component from decomposition
-  all_anoms <- data.frame(timestamp=numeric(0), count=numeric(0))
-  seasonal_plus_trend <- data.frame(timestamp=numeric(0), count=numeric(0))
+  all_anoms <- data.frame(timestamp = numeric(0), count = numeric(0))
+  seasonal_plus_trend <- data.frame(timestamp = numeric(0), count = numeric(0))
 
   # Detect anomalies on all data (either entire data in one-pass, or in 2 week blocks if longterm=TRUE)
-  for(i in 1:length(all_data)) {
+  for (i in 1:length(all_data)) {
 
     anomaly_direction = switch(direction,
-                               "pos" = data.frame(one_tail=TRUE, upper_tail=TRUE), # upper-tail only (positive going anomalies)
-                               "neg" = data.frame(one_tail=TRUE, upper_tail=FALSE), # lower-tail only (negative going anomalies)
-                               "both" = data.frame(one_tail=FALSE, upper_tail=TRUE)) # Both tails. Tail direction is not actually used.
+                               "pos" = data.frame(one_tail = TRUE, upper_tail = TRUE), # upper-tail only (positive going anomalies)
+                               "neg" = data.frame(one_tail = TRUE, upper_tail = FALSE), # lower-tail only (negative going anomalies)
+                               "both" = data.frame(one_tail = FALSE, upper_tail = TRUE)) # Both tails. Tail direction is not actually used.
 
     # detect_anoms actually performs the anomaly detection and returns the results in a list containing the anomalies
     # as well as the decomposed components of the time series for further analysis.
-    s_h_esd_timestamps <- detect_anoms(all_data[[i]], k=max_anoms, alpha=alpha, num_obs_per_period=period, use_decomp=TRUE, use_esd=FALSE,
-                                       one_tail=anomaly_direction$one_tail, upper_tail=anomaly_direction$upper_tail, verbose=verbose)
+    s_h_esd_timestamps <- detect_anoms(all_data[[i]], k = max_anoms, alpha = alpha, num_obs_per_period = period, use_decomp = TRUE, use_esd = FALSE,
+                                       one_tail = anomaly_direction$one_tail, upper_tail = anomaly_direction$upper_tail, verbose = verbose)
 
     # store decomposed components in local variable and overwrite s_h_esd_timestamps to contain only the anom timestamps
     data_decomp <- s_h_esd_timestamps$stl
     s_h_esd_timestamps <- s_h_esd_timestamps$anoms
 
     # -- Step 3: Use detected anomaly timestamps to extract the actual anomalies (timestamp and value) from the data
-    if(!is.null(s_h_esd_timestamps)){
+    if (!is.null(s_h_esd_timestamps)) {
       anoms <- subset(all_data[[i]], (all_data[[i]][[1]] %in% s_h_esd_timestamps))
     } else {
-      anoms <- data.frame(timestamp=numeric(0), count=numeric(0))
+      anoms <- data.frame(timestamp = numeric(0), count = numeric(0))
     }
 
     # Filter the anomalies using one of the thresholding functions if applicable
-    if(threshold != "None"){
+    if (threshold != "None") {
       # Calculate daily max values
-      periodic_maxs <- tapply(x[[2]],as.Date(x[[1]]),FUN=max)
+      periodic_maxs <- tapply(x[[2]], as.Date(x[[1]]), FUN = max)
 
       # Calculate the threshold set by the user
-      if(threshold == 'med_max'){
+      if (threshold == 'med_max') {
         thresh <- median(periodic_maxs)
-      }else if (threshold == 'p95'){
+      } else if (threshold == 'p95') {
         thresh <- quantile(periodic_maxs, .95)
-      }else if (threshold == 'p99'){
+      } else if (threshold == 'p99') {
         thresh <- quantile(periodic_maxs, .99)
       }
       # Remove any anoms below the threshold
@@ -265,20 +266,20 @@ AnomalyDetectionTs <- function(x, max_anoms = 0.10, direction = 'pos',
   seasonal_plus_trend <- seasonal_plus_trend[!duplicated(seasonal_plus_trend[[1]]), ]
 
   # -- If only_last was set by the user, create subset of the data that represent the most recent day
-  if(!is.null(only_last)){
-    start_date <- x[[1]][num_obs]-lubridate::days(7)
-    start_anoms <- x[[1]][num_obs]-lubridate::days(1)
-    if(gran == "day"){
+  if (!is.null(only_last)) {
+    start_date <- x[[1]][num_obs] - lubridate::days(7)
+    start_anoms <- x[[1]][num_obs] - lubridate::days(1)
+    if (gran == "day") {
      #TODO: This might be better set up top at the gran check
      breaks <- 3*12
      num_days_per_line <- 7
    } else {
-      if(only_last == 'day'){
+      if (only_last == 'day') {
        breaks <- 12
-      }else{
+      } else {
        # We need to change start_date and start_anoms for the hourly only_last option
-        start_date <- lubridate::floor_date(x[[1]][num_obs]-lubridate::days(2), "day")
-        start_anoms <- x[[1]][num_obs]-lubridate::hours(1)
+        start_date <- lubridate::floor_date(x[[1]][num_obs] - lubridate::days(2), "day")
+        start_anoms <- x[[1]][num_obs] - lubridate::hours(1)
        breaks <- 3
      }
    }
@@ -295,70 +296,87 @@ AnomalyDetectionTs <- function(x, max_anoms = 0.10, direction = 'pos',
   anom_pct <- (length(all_anoms[[2]]) / num_obs) * 100
 
   # If there are no anoms, then let's exit
-  if(anom_pct == 0){
-    if(verbose) message("No anomalies detected.")
-    return (list("anoms"=data.frame(), "plot"=plot.new()))
+  if (anom_pct == 0) {
+    if (verbose) message("No anomalies detected.")
+    return(list("anoms" = data.frame(), "plot" = plot.new()))
   }
 
-  if(plot){
+  if (plot) {
     # -- Build title for plots utilizing parameters set by user
-    plot_title <- paste(title, round(anom_pct, digits=2), "% Anomalies (alpha=", alpha, ", direction=", direction,")", sep="")
-    if(longterm){
-      plot_title <- paste(plot_title, ", longterm=T", sep="")
+    plot_title <- paste(title, round(anom_pct, digits = 2), "% Anomalies (alpha=", alpha, ", direction=", direction,")", sep = "")
+    if (longterm) {
+      plot_title <- paste(plot_title, ", longterm=T", sep = "")
    }
 
    # -- Plot raw time series data
-    color_name <- paste("\"", title, "\"", sep="")
+    color_name <- paste("\"", title, "\"", sep = "")
    alpha <- 0.8
-    if(!is.null(only_last)){
-      xgraph <- ggplot2::ggplot(x_subset_week, ggplot2::aes_string(x="timestamp", y="count")) + ggplot2::theme_bw() + ggplot2::theme(panel.grid.major = ggplot2::element_blank(), panel.grid.minor = ggplot2::element_blank(), text=ggplot2::element_text(size = 14))
-      xgraph <- xgraph + ggplot2::geom_line(data=x_subset_week, ggplot2::aes_string(colour=color_name), alpha=alpha*.33) + ggplot2::geom_line(data=x_subset_single_day, ggplot2::aes_string(color=color_name), alpha=alpha)
-      week_rng = get_range(x_subset_week, index=2, y_log=y_log)
-      day_rng = get_range(x_subset_single_day, index=2, y_log=y_log)
-      yrange = c(min(week_rng[1],day_rng[1]), max(week_rng[2],day_rng[2]))
-      xgraph <- add_day_labels_datetime(xgraph, breaks=breaks, start=as.POSIXlt(min(x_subset_week[[1]]), tz="UTC"), end=as.POSIXlt(max(x_subset_single_day[[1]]), tz="UTC"), days_per_line=num_days_per_line)
-      xgraph <- xgraph + ggplot2::labs(x=xlabel, y=ylabel, title=plot_title)
-    }else{
-      xgraph <- ggplot2::ggplot(x, ggplot2::aes_string(x="timestamp", y="count")) + ggplot2::theme_bw() + ggplot2::theme(panel.grid.major = ggplot2::element_line(colour = "gray60"), panel.grid.major.y = ggplot2::element_blank(), panel.grid.minor = ggplot2::element_blank(), text=ggplot2::element_text(size = 14))
-      xgraph <- xgraph + ggplot2::geom_line(data=x, ggplot2::aes_string(colour=color_name), alpha=alpha)
-      yrange <- get_range(x, index=2, y_log=y_log)
-      xgraph <- xgraph + ggplot2::scale_x_datetime(labels=function(x) ifelse(as.POSIXlt(x, tz="UTC")$hour != 0,strftime(x, format="%kh", tz="UTC"), strftime(x, format="%b %e", tz="UTC")),
-                                                   expand=c(0,0))
-      xgraph <- xgraph + ggplot2::labs(x=xlabel, y=ylabel, title=plot_title)
+    if (!is.null(only_last)) {
+      xgraph <- ggplot2::ggplot(x_subset_week, ggplot2::aes_string(x = "timestamp", y = "count")) +
+        ggplot2::theme_bw() +
+        ggplot2::theme(panel.grid.major = ggplot2::element_blank(),
+                       panel.grid.minor = ggplot2::element_blank(),
+                       text = ggplot2::element_text(size = 14))
+      xgraph <- xgraph +
+        ggplot2::geom_line(data = x_subset_week, ggplot2::aes_string(colour = color_name), alpha = alpha*.33) +
+        ggplot2::geom_line(data = x_subset_single_day, ggplot2::aes_string(color = color_name), alpha = alpha)
+      week_rng <- get_range(x_subset_week, index = 2, y_log = y_log)
+      day_rng <- get_range(x_subset_single_day, index = 2, y_log = y_log)
+      yrange <- c(min(week_rng[1], day_rng[1]), max(week_rng[2], day_rng[2]))
+      xgraph <- add_day_labels_datetime(xgraph, breaks = breaks, start = as.POSIXlt(min(x_subset_week[[1]]), tz = "UTC"), end = as.POSIXlt(max(x_subset_single_day[[1]]), tz = "UTC"), days_per_line = num_days_per_line)
+      xgraph <- xgraph +
+        ggplot2::labs(x = xlabel, y = ylabel, title = plot_title)
+    } else {
+      xgraph <- ggplot2::ggplot(x, ggplot2::aes_string(x = "timestamp", y = "count")) +
+        ggplot2::theme_bw() +
+        ggplot2::theme(panel.grid.major = ggplot2::element_line(colour = "gray60"),
+                       panel.grid.major.y = ggplot2::element_blank(),
+                       panel.grid.minor = ggplot2::element_blank(),
+                       text = ggplot2::element_text(size = 14))
+      xgraph <- xgraph +
+        ggplot2::geom_line(data = x, ggplot2::aes_string(colour = color_name), alpha = alpha)
+      yrange <- get_range(x, index = 2, y_log = y_log)
+      xgraph <- xgraph +
+        ggplot2::scale_x_datetime(labels = function(x) ifelse(as.POSIXlt(x, tz = "UTC")$hour != 0, strftime(x, format = "%kh", tz = "UTC"), strftime(x, format = "%b %e", tz = "UTC")),
+                                  expand = c(0,0))
+      xgraph <- xgraph +
+        ggplot2::labs(x = xlabel, y = ylabel, title = plot_title)
    }
 
    # Add anoms to the plot as circles.
    # We add zzz_ to the start of the name to ensure that the anoms are listed after the data sets.
-    xgraph <- xgraph + ggplot2::geom_point(data=all_anoms, ggplot2::aes_string(color=paste("\"zzz_",title,"\"",sep="")), size = 3, shape = 1)
+    xgraph <- xgraph +
+      ggplot2::geom_point(data = all_anoms, ggplot2::aes_string(color = paste("\"zzz_", title, "\"", sep = "")), size = 3, shape = 1)
 
    # Hide legend
-    xgraph <- xgraph + ggplot2::theme(legend.position="none")
+    xgraph <- xgraph +
+      ggplot2::theme(legend.position = "none")
 
    # Use log scaling if set by user
-    xgraph <- xgraph + add_formatted_y(yrange, y_log=y_log)
-
+    xgraph <- xgraph +
+      add_formatted_y(yrange, y_log = y_log)
  }
 
   # Fix to make sure date-time is correct and that we retain hms at midnight
-  all_anoms[[1]] <- format(all_anoms[[1]], format="%Y-%m-%d %H:%M:%S")
+  all_anoms[[1]] <- format(all_anoms[[1]], format = "%Y-%m-%d %H:%M:%S")
 
   # Store expected values if set by user
-  if(e_value) {
-    anoms <- data.frame(timestamp=all_anoms[[1]], anoms=all_anoms[[2]],
-                        expected_value=subset(seasonal_plus_trend[[2]], as.POSIXlt(seasonal_plus_trend[[1]], tz="UTC") %in% all_anoms[[1]]),
-                        stringsAsFactors=FALSE)
+  if (e_value) {
+    anoms <- data.frame(timestamp = all_anoms[[1]], anoms = all_anoms[[2]],
+                        expected_value = subset(seasonal_plus_trend[[2]], as.POSIXlt(seasonal_plus_trend[[1]], tz = "UTC") %in% all_anoms[[1]]),
+                        stringsAsFactors = FALSE)
  } else {
-    anoms <- data.frame(timestamp=all_anoms[[1]], anoms=all_anoms[[2]], stringsAsFactors=FALSE)
+    anoms <- data.frame(timestamp = all_anoms[[1]], anoms = all_anoms[[2]], stringsAsFactors = FALSE)
  }
 
-  # Make sure we're still a valid POSIXlt datetime.
+  # Make sure we're still a valid POSIXct datetime.
   # TODO: Make sure we keep original datetime format and timezone.
-  anoms$timestamp <- as.POSIXlt(anoms$timestamp, tz="UTC")
+  anoms$timestamp <- as.POSIXct(anoms$timestamp, tz = "UTC")
 
   # Lastly, return anoms and optionally the plot if requested by the user
-  if(plot){
-    return (list(anoms = anoms, plot = xgraph))
+  if (plot) {
+    return(list(anoms = anoms, plot = xgraph))
   } else {
-    return (list(anoms = anoms, plot = plot.new()))
+    return(list(anoms = anoms, plot = plot.new()))
   }
 }
diff --git a/data/raw_data.rda b/data/raw_data.rda
index ea704e5..6d760ca 100644
Binary files a/data/raw_data.rda and b/data/raw_data.rda differ
diff --git a/man/AnomalyDetectionTs.Rd b/man/AnomalyDetectionTs.Rd
index 8b3b000..e4516b9 100644
--- a/man/AnomalyDetectionTs.Rd
+++ b/man/AnomalyDetectionTs.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/ts_anom_detection.R
 \docType{data}
 \name{AnomalyDetectionTs}
@@ -9,7 +9,7 @@ AnomalyDetectionTs(x, max_anoms = 0.1, direction = "pos", alpha = 0.05,
   only_last = NULL, threshold = "None", e_value = FALSE, longterm = FALSE,
   piecewise_median_period_weeks = 2, plot = FALSE, y_log = FALSE,
   xlabel = "", ylabel = "count", title = NULL,
-  verbose = FALSE)
+  verbose = FALSE, na.rm = FALSE)
 }
 \arguments{
 \item{x}{Time series as a two column data frame where the first column consists of the
@@ -49,7 +49,9 @@ large positive anomalies relative to the rest of the data.}
 
 \item{title}{Title for the output plot.}
 
-\item{verbose}{Enable debug messages}
+\item{verbose}{Enable debug messages.}
+
+\item{na.rm}{Remove any NAs in timestamps.(default: FALSE)}
 }
 \value{
 The returned value is a list with the following components.
@@ -94,4 +96,3 @@ Rosner, B., (May 1983), "Percentage Points for a Generalized ESD Many-Outlier Pr
 \code{\link{AnomalyDetectionVec}}
 }
 \keyword{datasets}
-
diff --git a/man/AnomalyDetectionVec.Rd b/man/AnomalyDetectionVec.Rd
index 9aeb3f1..18841e0 100644
--- a/man/AnomalyDetectionVec.Rd
+++ b/man/AnomalyDetectionVec.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/vec_anom_detection.R
 \docType{data}
 \name{AnomalyDetectionVec}
@@ -11,29 +11,29 @@ AnomalyDetectionVec(x, max_anoms = 0.1, direction = "pos", alpha = 0.05,
   ylabel = "count", title = NULL, verbose = FALSE)
 }
 \arguments{
-\item{x}{Time series as a column data frame, list, or vector, where the column consists of
+\item{x}{Time series as a column data frame, list, or vector, where the column consists of
 the observations.}
 
 \item{max_anoms}{Maximum number of anomalies that S-H-ESD will detect as a percentage of the
 data.}
 
-\item{direction}{Directionality of the anomalies to be detected. Options are:
+\item{direction}{Directionality of the anomalies to be detected. Options are:
 \code{'pos' | 'neg' | 'both'}.}
 
 \item{alpha}{The level of statistical significance with which to accept or reject anomalies.}
 
-\item{period}{Defines the number of observations in a single period, and used during seasonal
+\item{period}{Defines the number of observations in a single period, and used during seasonal
 decomposition.}
 
 \item{only_last}{Find and report anomalies only within the last period in the time series.}
 
-\item{threshold}{Only report positive going anoms above the threshold specified. Options are:
+\item{threshold}{Only report positive going anoms above the threshold specified. Options are:
 \code{'None' | 'med_max' | 'p95' | 'p99'}.}
 
 \item{e_value}{Add an additional column to the anoms output containing the expected value.}
 
-\item{longterm_period}{Defines the number of observations for which the trend can be considered
-flat. The value should be an integer multiple of the number of observations in a single period.
+\item{longterm_period}{Defines the number of observations for which the trend can be considered
+flat. The value should be an integer multiple of the number of observations in a single period.
 This increases anom detection efficacy for time series that are greater than a month.}
 
 \item{plot}{A flag indicating if a plot with both the time series and the estimated anoms,
@@ -58,33 +58,33 @@ The returned value is a list with the following components.
 \item{plot}{A graphical object if plotting was requested by the user. The plot contains
 the estimated anomalies annotated on the input time series.}
 
-One can save \code{anoms} to a file in the following fashion:
+One can save \code{anoms} to a file in the following fashion:
 \code{write.csv([["anoms"]], file=)}
 
-One can save \code{plot} to a file in the following fashion:
+One can save \code{plot} to a file in the following fashion:
 \code{ggsave(, plot=[["plot"]])}
 }
 \description{
-A technique for detecting anomalies in seasonal univariate time series where the input is a
+A technique for detecting anomalies in seasonal univariate time series where the input is a
 series of observations.
 }
 \details{
 \code{longterm_period} This option should be set when the input time series is longer than a month.
 The option enables the approach described in Vallis, Hochenbaum, and Kejariwal (2014).\cr\cr
-\code{threshold} Filter all negative anomalies and those anomalies whose magnitude is smaller
-than one of the specified thresholds which include: the median
-of the daily max values (med_max), the 95th percentile of the daily max values (p95), and the
+\code{threshold} Filter all negative anomalies and those anomalies whose magnitude is smaller
+than one of the specified thresholds which include: the median
+of the daily max values (med_max), the 95th percentile of the daily max values (p95), and the
 99th percentile of the daily max values (p99).
 }
 \examples{
 data(raw_data)
 AnomalyDetectionVec(raw_data[,2], max_anoms=0.02, period=1440, direction='both', plot=TRUE)
 # To detect only the anomalies in the last period, run the following:
-AnomalyDetectionVec(raw_data[,2], max_anoms=0.02, period=1440, direction='both',
+AnomalyDetectionVec(raw_data[,2], max_anoms=0.02, period=1440, direction='both',
 only_last=TRUE, plot=TRUE)
 }
 \references{
-Vallis, O., Hochenbaum, J. and Kejariwal, A., (2014) "A Novel Technique for
+Vallis, O., Hochenbaum, J. and Kejariwal, A., (2014) "A Novel Technique for
 Long-Term Anomaly Detection in the Cloud", 6th USENIX, Philadelphia, PA.
 
 Rosner, B., (May 1983), "Percentage Points for a Generalized ESD Many-Outlier Procedure"
@@ -94,4 +94,3 @@ Rosner, B., (May 1983), "Percentage Points for a Generalized ESD Many-Outlier Pr
 \code{\link{AnomalyDetectionTs}}
 }
 \keyword{datasets}
-
diff --git a/man/raw_data.Rd b/man/raw_data.Rd
index 00e9fb9..6043d92 100644
--- a/man/raw_data.Rd
+++ b/man/raw_data.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/raw_data.R
 \docType{data}
 \name{raw_data}
@@ -10,4 +10,3 @@ data(raw_data)
 \description{
 A data frame containing a time series with headings timestamp and count.
 }
-
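
The diff above moves the package's timestamp handling from POSIXlt to POSIXct and exposes a na.rm argument on AnomalyDetectionTs. The following is a minimal usage sketch, not part of the patch: it assumes this branch of AnomalyDetection is installed, uses the bundled raw_data set, and blanks out an arbitrarily chosen row (row 100 is just an example) to exercise na.rm.

library(AnomalyDetection)   # assumed: built from this branch

data(raw_data)
x <- raw_data
x[100, 1] <- NA   # inject one missing timestamp to exercise the new na.rm argument

# na.rm = TRUE drops the NA timestamp instead of stopping with an error
res <- AnomalyDetectionTs(x, max_anoms = 0.02, direction = "both",
                          na.rm = TRUE, plot = FALSE)

head(res$anoms)
class(res$anoms$timestamp)   # with this change: "POSIXct" "POSIXt" (previously POSIXlt)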
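
For the R/date_utils.R change, the sketch below (again not part of the patch, and reaching the unexported helper via :::) illustrates the behaviour the added trailing as.POSIXct() call is meant to guarantee: whichever textual format is detected, the timestamp column comes back as POSIXct in UTC.

library(AnomalyDetection)

df <- data.frame(timestamp = c("2015-03-01 00:00:00", "2015-03-01 00:01:00"),
                 count = c(1, 2),
                 stringsAsFactors = FALSE)

out <- AnomalyDetection:::format_timestamp(df)   # unexported helper, hence :::
class(out$timestamp)   # "POSIXct" "POSIXt" rather than POSIXlt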