diff --git a/libbpf-tools/biotop.c b/libbpf-tools/biotop.c index 62995958281a..897a476cb610 100644 --- a/libbpf-tools/biotop.c +++ b/libbpf-tools/biotop.c @@ -31,6 +31,8 @@ #define warn(...) fprintf(stderr, __VA_ARGS__) #define OUTPUT_ROWS_LIMIT 10240 +#define OPT_OUTPUT 1 /* --output */ + enum SORT { ALL, IO, @@ -50,6 +52,11 @@ struct vector { void **elems; }; +struct data_t { + struct info_t key; + struct val_t value; +}; + int grow_vector(struct vector *vector) { if (vector->nr >= vector->capacity) { void **reallocated; @@ -87,6 +94,8 @@ static int interval = 1; static int count = 99999999; static pid_t target_pid = 0; static bool verbose = false; +enum output_format output = 0; +static struct data_t datas[OUTPUT_ROWS_LIMIT]; const char *argp_program_version = "biotop 0.1"; const char *argp_program_bug_address = @@ -107,6 +116,7 @@ static const struct argp_option opts[] = { { "rows", 'r', "ROWS", 0, "Maximum rows to print, default 20", 0 }, { "pid", 'p', "PID", 0, "Process ID to trace", 0 }, { "verbose", 'v', NULL, 0, "Verbose debug output", 0 }, + { "output", OPT_OUTPUT, "FORMAT", OPTION_ARG_OPTIONAL, "Output metrics in specified format (currently only 'line' supported)", 0 }, { NULL, 'h', NULL, OPTION_HIDDEN, "Show the full help", 0 }, {}, }; @@ -160,6 +170,16 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) case 'h': argp_state_help(state, stderr, ARGP_HELP_STD_HELP); break; + case OPT_OUTPUT: + output = FORMAT_LINE_PROTOCOL; + if (arg) { + if (strcmp(arg, "line") == 0) + output = FORMAT_LINE_PROTOCOL; + else + argp_error(state, "Invalid output format: %s. 
" + "Only 'line' is supported.", arg); + } + break; case ARGP_KEY_ARG: errno = 0; if (pos_args == 0) { @@ -198,11 +218,6 @@ static void sig_int(int signo) exiting = 1; } -struct data_t { - struct info_t key; - struct val_t value; -}; - static int sort_column(const void *obj1, const void *obj2) { struct data_t *d1 = (struct data_t *) obj1; @@ -299,13 +314,58 @@ static int read_stat(struct biotop_bpf *obj, struct data_t *datas, __u32 *count) return 0; } +static int print_metrics(struct biotop_bpf *obj) +{ + int i, err = 0, rows = OUTPUT_ROWS_LIMIT; + time_t ts = time(NULL); + struct metric m = { + .name = "biotop", + .tags = {{ "pid", "" }}, + .nr_tags = 1, + .fields = { + { "I/O", 0 }, + { "Kbytes", 0}, + { "AVGms", 0} + }, + .nr_fields = 3, + .ts = ts + }; + + err = read_stat(obj, datas, (__u32*) &rows); + if (err) { + fprintf(stderr, "read stat failed: %s\n", strerror(errno)); + return err; + } + + for (i = 0; i < rows; i++) { + struct info_t *key = &datas[i].key; + struct val_t *value = &datas[i].value; + float avg_ms = 0; + + /* Tag */ + snprintf(m.tags[0].value, sizeof(m.tags[0].value), "%u", key->pid); + + /* To avoid floating point exception. 
*/ + if (value->io) + avg_ms = ((float) value->us) / 1000 / value->io; + + /* Fields */ + m.fields[0].value = value->io; + m.fields[1].value = value->bytes; + m.fields[2].value = avg_ms; + + print_metric(&m, output); + } + + return 0; +} + static int print_stat(struct biotop_bpf *obj) { FILE *f; time_t t; struct tm *tm; char ts[16], buf[256]; - static struct data_t datas[OUTPUT_ROWS_LIMIT]; int n, i, err = 0, rows = OUTPUT_ROWS_LIMIT; f = fopen("/proc/loadavg", "r"); @@ -319,6 +379,7 @@ static int print_stat(struct biotop_bpf *obj) printf("%8s loadavg: %s\n", ts, buf); fclose(f); } + printf("%-7s %-16s %1s %-3s %-3s %-8s %5s %7s %6s\n", "PID", "COMM", "D", "MAJ", "MIN", "DISK", "I/O", "Kbytes", "AVGms"); @@ -448,15 +509,21 @@ int main(int argc, char **argv) while (1) { sleep(interval); - if (clear_screen) { - err = system("clear"); + if (output) { + err = print_metrics(obj); if (err) goto cleanup; - } + } else { + if (clear_screen) { + err = system("clear"); + if (err) + goto cleanup; + } - err = print_stat(obj); - if (err) - goto cleanup; + err = print_stat(obj); + if (err) + goto cleanup; + } count--; if (exiting || !count) diff --git a/libbpf-tools/memleak.c b/libbpf-tools/memleak.c index a42f61b9d961..330a15159c20 100644 --- a/libbpf-tools/memleak.c +++ b/libbpf-tools/memleak.c @@ -51,6 +51,7 @@ static struct env { bool verbose; char command[32]; char symbols_prefix[16]; + enum output_format output; } env = { .interval = 5, // posarg 1 .nr_intervals = -1, // posarg 2 @@ -90,6 +91,7 @@ struct allocation { #define OPT_PERF_MAX_STACK_DEPTH 1 /* --perf-max-stack-depth */ #define OPT_STACK_STORAGE_SIZE 2 /* --stack-storage-size */ +#define OPT_OUTPUT 3 /* --output */ #define __ATTACH_UPROBE(skel, sym_name, prog_name, is_retprobe) \ do { \ @@ -223,6 +225,7 @@ static const struct argp_option argp_options[] = { "the number of unique stack traces that can be stored and displayed (default 10240)", 0 }, {"perf-max-stack-depth", OPT_PERF_MAX_STACK_DEPTH, 
"PERF-MAX-STACK-DEPTH", 0, "the limit for both kernel and user stack traces (default 127)", 0 }, + {"output", OPT_OUTPUT, "FORMAT", OPTION_ARG_OPTIONAL, "Output metrics in specified format (currently only 'line' supported)", 0 }, {"verbose", 'v', NULL, 0, "verbose debug output", 0 }, {}, }; @@ -467,7 +470,8 @@ int main(int argc, char *argv[]) } #endif - print_headers(); + if (!env.output) + print_headers(); // main loop while (!exiting && env.nr_intervals) { @@ -603,6 +607,16 @@ error_t argp_parse_arg(int key, char *arg, struct argp_state *state) argp_usage(state); } break; + case OPT_OUTPUT: + env.output = FORMAT_LINE_PROTOCOL; + if (arg) { + if (strcmp(arg, "line") == 0) + env.output = FORMAT_LINE_PROTOCOL; + else + argp_error(state, "Invalid output format: %s. " + "Only 'line' is supported.", arg); + } + break; case ARGP_KEY_ARG: pos_args++; @@ -828,6 +842,27 @@ void print_stack_frames_by_syms_cache() } #endif +static void print_metrics(struct allocation *allocs, size_t nr_allocs, time_t ts) +{ + for (size_t i = 0; i < nr_allocs; ++i) { + const struct allocation *alloc = &allocs[i]; + struct metric m = { + .name = "memleak", + .tags = {{ "stackid", "" }}, + .nr_tags = 1, + .fields = { + { "size", alloc->size }, + { "count", alloc->count } + }, + .nr_fields = 2, + .ts = ts + }; + snprintf(m.tags[0].value, sizeof(m.tags[0].value), "%lu", alloc->stack_id); + + print_metric(&m, env.output); + } +} + int print_stack_frames(struct allocation *allocs, size_t nr_allocs, int stack_traces_fd) { for (size_t i = 0; i < nr_allocs; ++i) { @@ -977,6 +1012,11 @@ int print_outstanding_allocs(int allocs_fd, int stack_traces_fd) nr_allocs++; } + if (env.output) { + print_metrics(allocs, nr_allocs, t); + goto cleanup; + } + // sort the allocs array in descending order qsort(allocs, nr_allocs, sizeof(allocs[0]), alloc_size_compare); @@ -988,6 +1028,7 @@ int print_outstanding_allocs(int allocs_fd, int stack_traces_fd) print_stack_frames(allocs, nr_allocs_to_show, stack_traces_fd); 
+cleanup: // Reset allocs list so that we dont accidentaly reuse data the next time we call this function for (size_t i = 0; i < nr_allocs; i++) { allocs[i].stack_id = 0; @@ -1067,6 +1108,11 @@ int print_outstanding_combined_allocs(int combined_allocs_fd, int stack_traces_f nr_allocs++; } + if (env.output) { + print_metrics(allocs, nr_allocs, t); + goto cleanup; + } + qsort(allocs, nr_allocs, sizeof(allocs[0]), alloc_size_compare); // get min of allocs we stored vs the top N requested stacks @@ -1077,6 +1123,7 @@ int print_outstanding_combined_allocs(int combined_allocs_fd, int stack_traces_f print_stack_frames(allocs, nr_allocs, stack_traces_fd); +cleanup: if (nr_missing_stacks > 0) { fprintf(stderr, "WARNING: %zu stack traces could not be displayed" " due to memory shortage, including %zu caused by hash collisions." diff --git a/libbpf-tools/syscall_helpers.c b/libbpf-tools/syscall_helpers.c index 6a3892b9ad97..36590cf4ef81 100644 --- a/libbpf-tools/syscall_helpers.c +++ b/libbpf-tools/syscall_helpers.c @@ -810,7 +810,7 @@ static const char *syscall_names_generic[] = { size_t syscall_names_generic_size = sizeof(syscall_names_generic)/sizeof(char*); #endif -void syscall_name(unsigned n, char *buf, size_t size) +int syscall_name(unsigned n, char *buf, size_t size) { const char *name = NULL; @@ -824,10 +824,13 @@ void syscall_name(unsigned n, char *buf, size_t size) name = syscall_names_generic[n]; #endif - if (name) + if (name) { strncpy(buf, name, size-1); - else + return 0; + } else { snprintf(buf, size, "[unknown: %u]", n); + return -EINVAL; + } } int list_syscalls(void) diff --git a/libbpf-tools/syscall_helpers.h b/libbpf-tools/syscall_helpers.h index 06f296555fa5..13a53ad22a38 100644 --- a/libbpf-tools/syscall_helpers.h +++ b/libbpf-tools/syscall_helpers.h @@ -7,6 +7,6 @@ void init_syscall_names(void); void free_syscall_names(void); void list_syscalls(void); -void syscall_name(unsigned n, char *buf, size_t size); +int syscall_name(unsigned n, char *buf, size_t 
size); #endif /* __SYSCALL_HELPERS_H */ diff --git a/libbpf-tools/syscount.c b/libbpf-tools/syscount.c index 76e550918788..26fece99f8a8 100644 --- a/libbpf-tools/syscount.c +++ b/libbpf-tools/syscount.c @@ -25,6 +25,7 @@ struct data_ext_t { __u32 key; }; +#define OPT_OUTPUT 1 /* --output */ #define warn(...) fprintf(stderr, __VA_ARGS__) @@ -60,6 +61,7 @@ static const struct argp_option opts[] = { { "errno", 'e', "ERRNO", 0, "Trace only syscalls that return this error" "(numeric or EPERM, etc.)", 0 }, { "list", 'l', NULL, 0, "Print list of recognized syscalls and exit", 0 }, + { "output", OPT_OUTPUT, "FORMAT", OPTION_ARG_OPTIONAL, "Output metrics in specified format (currently only 'line' supported)", 0 }, { NULL, 'h', NULL, OPTION_HIDDEN, "Show the full help", 0 }, {}, }; @@ -78,6 +80,7 @@ static struct env { pid_t pid; char *cgroupspath; bool cg; + enum output_format output; } env = { .top = 10, }; @@ -152,6 +155,45 @@ static void print_count_header(void) printf("%-22s %8s\n", agg_colname(), "COUNT"); } +static void print_metrics(struct data_ext_t *datas, size_t count) +{ + time_t ts = time(NULL); + char buf[2 * TASK_COMM_LEN]; + struct data_ext_t *data; + int err = 0; + int i; + struct metric m = { + .name = "syscount", + .nr_tags = 1, + .ts = ts + }; + + for (i = 0; i < count; i++) { + data = &datas[i]; + + /* Tag */ + m.tags[0].key = !env.process ? "syscall" : "pid"; + if (!env.process) { + err = syscall_name(data->key, buf, sizeof(buf)); + snprintf(m.tags[0].value, sizeof(m.tags[0].value), "%s", err ? 
"unknown" : buf); + } else { + snprintf(m.tags[0].value, sizeof(m.tags[0].value), "%u", data->key); + } + + /* Fields */ + m.fields[0].key = "count"; + m.fields[0].value = data->count; + m.nr_fields = 1; + if (env.latency) { + m.fields[m.nr_fields].key = "latency"; + m.fields[m.nr_fields].value = data->total_ns; + m.nr_fields++; + } + + print_metric(&m, env.output); + } +} + static void print_latency(struct data_ext_t *vals, size_t count) { double div = env.milliseconds ? 1000000.0 : 1000.0; @@ -361,6 +403,16 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) case 'l': env.list_syscalls = true; break; + case OPT_OUTPUT: + env.output = FORMAT_LINE_PROTOCOL; + if (arg) { + if (strcmp(arg, "line") == 0) + env.output = FORMAT_LINE_PROTOCOL; + else + argp_error(state, "Invalid output format: %s. " + "Only 'line' is supported.", arg); + } + break; default: return ARGP_ERR_UNKNOWN; } @@ -474,9 +526,14 @@ int main(int argc, char **argv) } compar = env.latency ? compar_latency : compar_count; - print = env.latency ? print_latency : print_count; + if (env.output) + print = print_metrics; + else + print = env.latency ? print_latency : print_count; + + if (!env.output) + printf("Tracing syscalls, printing top %d... Ctrl+C to quit.\n", env.top); - printf("Tracing syscalls, printing top %d... 
Ctrl+C to quit.\n", env.top); while (hang_on) { sleep(env.interval ?: 1); if (env.duration) { @@ -493,8 +550,10 @@ int main(int argc, char **argv) if (!count) continue; - qsort(vals, count, sizeof(vals[0]), compar); - print_timestamp(); + if (!env.output) { + qsort(vals, count, sizeof(vals[0]), compar); + print_timestamp(); + } print(vals, count); } diff --git a/libbpf-tools/trace_helpers.c b/libbpf-tools/trace_helpers.c index 9626e24d710d..cc1815dd0df1 100644 --- a/libbpf-tools/trace_helpers.c +++ b/libbpf-tools/trace_helpers.c @@ -1311,3 +1311,36 @@ int str_to_long(const char *src, void *dest) return errno; } + +int print_metric_line(const struct metric *m) { + size_t i; + + /* name */ + printf("%s", m->name); + + /* tags */ + for (i = 0; i < m->nr_tags; i++) + printf(",%s=%s", m->tags[i].key, m->tags[i].value); + + /* fields */ + for (i = 0; i < m->nr_fields; i++) + printf("%s%s=%lu", i ? "," : " ", m->fields[i].key, m->fields[i].value); + + /* timestamp */ + printf(" %lld\n", (long long) m->ts); /* time_t is not guaranteed to be unsigned long; cast to a known width */ + + return 0; +} + +typedef int (*metric_print_fn_t)(const struct metric *m); + +static metric_print_fn_t print_functions[] = { + print_metric_line, /* FORMAT_LINE_PROTOCOL */ +}; + +int print_metric(const struct metric *m, enum output_format fmt) { + if (fmt < FORMAT_LINE_PROTOCOL || fmt >= FORMAT_MAX) + return -EINVAL; /* fmt 0 (unset) would otherwise index print_functions[fmt - 1] out of bounds */ + + return print_functions[fmt - 1](m); +} diff --git a/libbpf-tools/trace_helpers.h b/libbpf-tools/trace_helpers.h index dd215e3c54bf..ebf7a49f1fbe 100644 --- a/libbpf-tools/trace_helpers.h +++ b/libbpf-tools/trace_helpers.h @@ -5,6 +5,9 @@ #include #define NSEC_PER_SEC 1000000000ULL +#define MAX_TAGS 10 +#define MAX_FIELDS 10 +#define MAX_NAME_LEN 32 struct ksym { const char *name; @@ -121,4 +124,30 @@ int str_to_int(const char *src, void *dest); /* converts a string to a long integer */ int str_to_long(const char *src, void *dest); +enum output_format { + FORMAT_LINE_PROTOCOL = 1, // InfluxDB Line Protocol + FORMAT_MAX, +}; + +struct tag { + const char *key; + char value[MAX_NAME_LEN]; +}; + 
+struct field { + const char *key; + unsigned long value; +}; + +struct metric { + const char *name; + struct tag tags[MAX_TAGS]; + size_t nr_tags; + struct field fields[MAX_FIELDS]; + size_t nr_fields; + time_t ts; +}; + +int print_metric(const struct metric *m, enum output_format fmt); + #endif /* __TRACE_HELPERS_H */ diff --git a/tools/biotop_example.txt b/tools/biotop_example.txt index 62e8f1c70302..852c65fea0fe 100644 --- a/tools/biotop_example.txt +++ b/tools/biotop_example.txt @@ -163,6 +163,81 @@ This shows another "dd" command reading from xvda1. On this system, various creating and updating "status" files). +Output Metrics in Line Protocol +------------------------------- + +The `--output` option enables `biotop` to output metrics in InfluxDB Line +Protocol format, a text-based format for writing data points to time-series +databases like InfluxDB. This is useful for tracking I/O operations, kilobytes +transferred, and average latency per process ID (PID) for monitoring systems. + +Usage +----- +Use `--output` to switch to Line Protocol output. + +Example Output +-------------- +Running `biotop --output` might produce: +biotop,pid=279 I/O=6,Kbytes=69632,AVGms=0 1745473912 +biotop,pid=0 I/O=9,Kbytes=0,AVGms=94 1745473912 +biotop,pid=1585 I/O=14,Kbytes=94208,AVGms=1 1745473912 + +This format is easy to save to files or process with scripts and databases to +monitor I/O activity over time. + +Using the Output with InfluxDB and Grafana +------------------------------------------ + +The Line Protocol output can be stored in InfluxDB and visualized in Grafana, +similar to how folded stacks from `profile` are turned into flame graphs. +Here's how: + +1. Storing in InfluxDB +Pipe the output into InfluxDB using the `influx` CLI. 
For example: + +# biotop --output > biotop_metrics.txt +influx write -b mybucket -f biotop_metrics.txt -p s + +For real-time ingestion: +# unbuffer biotop --output | influx write -b mybucket -p s + +Note: The -p s option specifies second-precision timestamps. + +2. Visualizing Metrics +Once stored in InfluxDB, query the `biotop` measurement in Grafana to create +time-series graphs. Here are some examples: + +2.1) I/O Operations per PID +Command: `biotop --output` +Shows I/O operations per process ID as a time-series graph, useful for finding +processes with high I/O activity. +See: bcc/tools/images/biotop_io.png +Query: +from(bucket: "mybucket") + |> range(start: v.timeRangeStart, stop: v.timeRangeStop) + |> filter(fn: (r) => r._measurement == "biotop" and r._field == "I/O") + +2.2) Latency per PID +Command: `biotop --output` +Shows average I/O latency per process ID in milliseconds, helping identify +processes with high I/O delays. +See: bcc/tools/images/biotop_latency.png +Query: +from(bucket: "mybucket") + |> range(start: v.timeRangeStart, stop: v.timeRangeStop) + |> filter(fn: (r) => r._measurement == "biotop" and r._field == "AVGms") + +2.3) Combined I/O Metrics Dashboard +Command: `biotop --output` +Displays I/O operations, kilobytes transferred, and average latency per process +ID in a single Grafana dashboard, providing a comprehensive view of I/O activity +for real-time monitoring and trend analysis. +See: bcc/tools/images/biotops.png + +These graphs can be added to a Grafana dashboard to monitor I/O activity in +real-time or analyze trends. 
+ + USAGE message: # ./biotop.py -h diff --git a/tools/images/biotop_io.png b/tools/images/biotop_io.png new file mode 100644 index 000000000000..54f28a22f757 Binary files /dev/null and b/tools/images/biotop_io.png differ diff --git a/tools/images/biotop_latency.png b/tools/images/biotop_latency.png new file mode 100644 index 000000000000..f3a0a5cdd0a5 Binary files /dev/null and b/tools/images/biotop_latency.png differ diff --git a/tools/images/biotops.png b/tools/images/biotops.png new file mode 100644 index 000000000000..24eb29f0b3d3 Binary files /dev/null and b/tools/images/biotops.png differ diff --git a/tools/images/memleak_count.png b/tools/images/memleak_count.png new file mode 100644 index 000000000000..e487c628c2a4 Binary files /dev/null and b/tools/images/memleak_count.png differ diff --git a/tools/images/memleak_kernel.png b/tools/images/memleak_kernel.png new file mode 100644 index 000000000000..6a4fe5ee8fbc Binary files /dev/null and b/tools/images/memleak_kernel.png differ diff --git a/tools/images/memleak_size.png b/tools/images/memleak_size.png new file mode 100644 index 000000000000..90f92187600e Binary files /dev/null and b/tools/images/memleak_size.png differ diff --git a/tools/images/syscount_call.png b/tools/images/syscount_call.png new file mode 100644 index 000000000000..61276ec59408 Binary files /dev/null and b/tools/images/syscount_call.png differ diff --git a/tools/images/syscount_latency.png b/tools/images/syscount_latency.png new file mode 100644 index 000000000000..6b55784f8864 Binary files /dev/null and b/tools/images/syscount_latency.png differ diff --git a/tools/images/syscount_process.png b/tools/images/syscount_process.png new file mode 100644 index 000000000000..28be8ca4ec95 Binary files /dev/null and b/tools/images/syscount_process.png differ diff --git a/tools/images/syscounts.png b/tools/images/syscounts.png new file mode 100644 index 000000000000..7a1b4df81c33 Binary files /dev/null and b/tools/images/syscounts.png differ diff 
--git a/tools/memleak_example.txt b/tools/memleak_example.txt index 4d4a2665dfc6..5a6da98a983f 100644 --- a/tools/memleak_example.txt +++ b/tools/memleak_example.txt @@ -29,7 +29,7 @@ inspect each allocation individually -- you get a nice summary of which stack is responsible for a large leak. Occasionally, you do want the individual allocation details. Perhaps the same -stack is allocating various sizes and you want to confirm which sizes are +stack is allocating various sizes and you want to confirm which sizes are prevalent. Use the -a switch: # ./memleak -p $(pidof allocs) -a @@ -109,18 +109,18 @@ to reduce the memory overhead. To avoid false positives, allocations younger than a certain age (500ms by default) are not printed. To change this threshold, use the -o switch. -By default, memleak prints its output every 5 seconds. To change this -interval, pass the interval as a positional parameter to memleak. You can +By default, memleak prints its output every 5 seconds. To change this +interval, pass the interval as a positional parameter to memleak. You can also control the number of times the output will be printed before exiting. For example: # ./memleak 1 10 ... will print the outstanding allocation statistics every second, for ten -times, and then exit. +times, and then exit. memleak may introduce considerable overhead if your application or kernel is -allocating and freeing memory at a very high rate. In that case, you can +allocating and freeing memory at a very high rate. In that case, you can control the overhead by sampling every N-th allocation. For example, to sample roughly 10% of the allocations and print the outstanding allocations every 5 seconds, 3 times before quitting: @@ -142,9 +142,9 @@ Attaching to pid 2614, Ctrl+C to quit. 
main+0x6d [allocs] __libc_start_main+0xf0 [libc-2.21.so] -Note that even though the application leaks 16 bytes of memory every second, +Note that even though the application leaks 16 bytes of memory every second, the report (printed every 5 seconds) doesn't "see" all the allocations because -of the sampling rate applied. +of the sampling rate applied. Profiling in memory part is hard to be accurate because of BPF infrastructure. memleak keeps misjudging memory leak on the complicated environment which has @@ -177,10 +177,90 @@ Attaching to pid 2623, Ctrl+C to quit. 0x0000559b478700b7 main+0x4a7 [redis-server] 0x00007fdf47029d90 __libc_start_call_main+0x80 [libc.so.6] -When using the --symbols-prefix argument, memleak can trace the third-party memory +When using the --symbols-prefix argument, memleak can trace the third-party memory allocations, such as jemalloc whose symbols are usually identified by the "je_" prefix in redis project. + +Output Metrics in Line Protocol +------------------------------- + +The `--output` option enables `memleak` to output metrics in InfluxDB Line +Protocol format, a text-based format for writing data points to time-series +databases like InfluxDB. This is useful for tracking outstanding allocations and +piping them into monitoring systems for analysis. + +Usage +----- +Use `--output` to switch to Line Protocol output. Combine with `<interval>` +for periodic metrics, or other options to adjust output. + +Example Output +-------------- +Running `memleak --output 1` might produce: +memleak,stackid=14613 size=22753280,count=5555 1742641470 +memleak,stackid=12929 size=40960,count=10 1742641470 +memleak,stackid=15137 size=16384,count=2 1742641470 + + +Using the Output with InfluxDB and Grafana +------------------------------------------ + +The Line Protocol output can be stored in InfluxDB and visualized in Grafana, +similar to how folded stacks from `profile` are turned into flame graphs. +Here's how: + +1. 
Storing in InfluxDB +Pipe the output into InfluxDB using the `influx` CLI. For example: + +# memleak --output 1 > memleak_metrics.txt +$ influx write -b mybucket -f memleak_metrics.txt -p s + +For real-time ingestion: +# unbuffer memleak --output 1 | influx write -b mybucket -p s + +Note: The -p s option specifies second-precision timestamps. Saving metrics +files is handy for debugging or reprocessing. + +2. Visualizing Metrics +Once stored in InfluxDB, query the `memleak` measurement to create graphs in +Grafana or the InfluxDB Web UI (no Grafana needed). Here are three visualization +examples: + +2.1) Outstanding Allocation Size +Command: `memleak --output 1` +Shows outstanding allocation sizes per stack ID as a time-series graph, useful +for spotting large memory leaks. +See: images/memleak_size.png +Query: +from(bucket: "mybucket") + |> range(start: v.timeRangeStart, stop: v.timeRangeStop) + |> filter(fn: (r) => r._measurement == "memleak" and r._field == "size") + +2.2) Outstanding Allocation Count +Command: `memleak --output 1` +Displays the number of outstanding allocations per stack ID, helping identify +frequent memory leaks. +See: images/memleak_count.png +Grafana Query: +from(bucket: "mybucket") + |> range(start: v.timeRangeStart, stop: v.timeRangeStop) + |> filter(fn: (r) => r._measurement == "memleak" and r._field == "count") + +2.3) Combined Memory Leak Metrics Dashboard +Displays both outstanding allocation size and count in a single Grafana dashboard, +offering a comprehensive overview of memory leak activity for real-time +monitoring and trend analysis. +See: images/memleak_kernel.png + +This visualization combines the allocation size and count metrics per stack ID, +enabling efficient identification of both large and frequent memory leaks in a +unified view. + +These graphs can be added to a Grafana dashboard or viewed in the InfluxDB UI +to monitor outstanding allocations in real-time or analyze historical trends. 
+ + USAGE message: # ./memleak -h diff --git a/tools/syscount_example.txt b/tools/syscount_example.txt index 7e4ccaa6ce00..57714893503b 100644 --- a/tools/syscount_example.txt +++ b/tools/syscount_example.txt @@ -154,6 +154,97 @@ SYSCALL COUNT stat 316 ^C + +Output Metrics in Line Protocol +------------------------------- + +The `--output` option enables `syscount` to output metrics in InfluxDB Line +Protocol format, a text-based format for writing data points into time-series +databases like InfluxDB. This is useful for piping syscall counts and latencies +into monitoring systems for visualization and analysis. + +Usage +----- +Use `--output` (or `--output=line`) to switch to Line Protocol output. Combine +with options like `-P` (group by process), `-L` (include latency), and `-i +<interval>` for periodic metrics. + +Example Output +-------------- +Running `syscount --output -i 1` might produce: +syscount,syscall=read count=45 1744253477 +syscount,syscall=write count=23 1744253477 +syscount,syscall=openat count=12 1744253477 + +With `-P` (group by process): +syscount,pid=1234 count=45 1744253477 +syscount,pid=5678 count=19 1744253477 + +With `-L` (include latency): +syscount,syscall=read count=45,latency=70217 1744253477 +syscount,syscall=write count=23,latency=45123 1744253477 + + +Using the Output with InfluxDB and Grafana +------------------------------------------ + +The Line Protocol output can be stored in InfluxDB and visualized in Grafana, +similar to how folded stacks from `profile` are turned into flame graphs. +Here's how: + +1. Storing in InfluxDB +Pipe the output into InfluxDB using the `influx` CLI. For example: + +# syscount --output -i 1 > syscount_metrics.txt +$ influx write -b mybucket -f syscount_metrics.txt -p s + +For real-time ingestion: +# unbuffer syscount --output -i 1 | influx write -b mybucket -p s + +Note: The -p s option specifies second-precision timestamps. + +2. 
Visualizing in Grafana +Once stored in InfluxDB, query the `syscount` measurement in Grafana to create +time-series graphs. Here are some examples: + +1) Basic Mode +Command: `syscount --output -i 1` +Shows syscall counts over time as a line graph. +See: images/syscount_call.png +Grafana Query: +from(bucket: "mybucket") + |> range(start: v.timeRangeStart, stop: v.timeRangeStop) + |> filter(fn: (r) => r._measurement == "syscount") + +2) Process Mode +Command: `syscount --output -P -i 1` +Displays syscall counts grouped by process ID, useful for identifying +per-process activity. +See: images/syscount_process.png +Grafana Query: +from(bucket: "mybucket") + |> range(start: v.timeRangeStart, stop: v.timeRangeStop) + |> filter(fn: (r) => r._measurement == "syscount") + +3) Latency Mode +Command: `syscount --output -L -i 1` +Plots syscall counts and latencies together, highlighting performance +bottlenecks. +See: images/syscount_latency.png +Grafana Query: +from(bucket: "mybucket") + |> range(start: v.timeRangeStart, stop: v.timeRangeStop) + |> filter(fn: (r) => r._measurement == "syscount" and r._field == "latency") + +4) Combined Syscall Metrics Dashboard +Combines Basic Mode and Latency Mode in a single Grafana dashboard for a +comprehensive view of syscall activity and performance. +See: images/syscounts.png + +These graphs can be added to a Grafana dashboard to monitor syscall activity in +real-time or analyze historical trends. + + USAGE: # syscount -h usage: syscount.py [-h] [-p PID] [-t TID] [-i INTERVAL] [-d DURATION] [-T TOP]