Update go dependencies

This commit is contained in:
Manuel Alejandro de Brito Fontes 2018-12-05 13:27:09 -03:00
parent 432f534383
commit f4a4daed84
1299 changed files with 71186 additions and 91183 deletions

View file

@ -1,4 +1,5 @@
.*.sw?
process-exporter
.tarballs
process-exporter-*.tar.gz
load-generator
integration-tester
dist

View file

@ -0,0 +1,39 @@
builds:
- main: cmd/process-exporter/main.go
binary: process-exporter
flags: -tags netgo
goos:
- linux
goarch:
- amd64
- 386
- arm
- arm64
- ppc64
- ppc64le
archive:
name_template: "process-exporter-{{ .Version }}.{{ .Os }}-{{ .Arch }}{{ if .Arm }}v{{ .Arm }}{{ end }}"
wrap_in_directory: true
nfpm:
homepage: https://github.com/ncabatoff/process-exporter
maintainer: nick.cabatoff+procexp@gmail.com
description: Prometheus exporter to report on processes running
license: MIT
formats:
- deb
- rpm
bindir: /usr/bin
files:
"packaging/process-exporter.service": "/lib/systemd/system/process-exporter.service"
config_files:
"packaging/conf/all.yaml": "/etc/process-exporter/all.yaml"
scripts:
postinstall: "packaging/scripts/postinstall.sh"
postremove: "packaging/scripts/postremove.sh"
preremove: "packaging/scripts/preremove.sh"
release:
github:
owner: ncabatoff
name: process-exporter
draft: false
prerelease: true

View file

@ -1,35 +0,0 @@
repository:
path: github.com/ncabatoff/process-exporter
build:
binaries:
- name: process-exporter
path: ./cmd/process-exporter
flags: -a -tags netgo
tarball:
files:
- LICENSE
crossbuild:
platforms:
- linux/amd64
- linux/386
- darwin/amd64
- darwin/386
- freebsd/amd64
- freebsd/386
- openbsd/amd64
- openbsd/386
- netbsd/amd64
- netbsd/386
- dragonfly/amd64
- linux/arm
- linux/arm64
- freebsd/arm
# Temporarily deactivated as golang.org/x/sys does not have syscalls
# implemented for that os/platform combination.
#- openbsd/arm
#- linux/mips64
#- linux/mips64le
- netbsd/arm
- linux/ppc64
- linux/ppc64le

View file

@ -0,0 +1,29 @@
services:
- docker
language: go
env:
- IMAGE_TAG=`echo $TRAVIS_TAG|sed s/v//`
go:
- 1.10.x
before_install:
- sudo apt-get -qq update
- sudo apt-get install -y rpm
go_import_path: github.com/ncabatoff/process-exporter
script:
- make style vet test build smoke docker
- if [ -n "$IMAGE_TAG" ]; then make docker DOCKER_IMAGE_TAG=$IMAGE_TAG; fi
after_success:
- docker login -u $DOCKER_USER -p "$DOCKER_PASSWORD"
- >
test -n "$TRAVIS_TAG" &&
docker tag ncabatoff/process-exporter:$IMAGE_TAG ncabatoff/process-exporter:latest &&
docker push ncabatoff/process-exporter:$IMAGE_TAG &&
docker push ncabatoff/process-exporter:latest &&
curl -sL http://git.io/goreleaser | bash

View file

@ -1,17 +1,21 @@
# Start from a Debian image with the latest version of Go installed
# and a workspace (GOPATH) configured at /go.
FROM golang
# Copy the local package files to the container's workspace.
ADD . /go/src/github.com/ncabatoff/process-exporter
FROM golang:1.10 AS build
#RUN curl -L -s https://github.com/golang/dep/releases/download/v0.5.0/dep-linux-amd64 -o $GOPATH/bin/dep
#RUN chmod +x $GOPATH/bin/dep
WORKDIR /go/src/github.com/ncabatoff/process-exporter
ADD . .
#RUN dep ensure
# Build the process-exporter command inside the container.
RUN make -C /go/src/github.com/ncabatoff/process-exporter
RUN make
USER root
FROM scratch
COPY --from=build /go/src/github.com/ncabatoff/process-exporter/process-exporter /bin/process-exporter
# Run the process-exporter command by default when the container starts.
ENTRYPOINT ["/go/src/github.com/ncabatoff/process-exporter/process-exporter"]
ENTRYPOINT ["/bin/process-exporter"]
# Document that the service listens on port 9256.
EXPOSE 9256

View file

@ -0,0 +1,4 @@
FROM scratch
COPY gopath/bin/process-exporter /process-exporter
ENTRYPOINT ["/process-exporter"]
EXPOSE 9256

View file

@ -3,73 +3,156 @@
[[projects]]
branch = "master"
digest = "1:d6afaeed1502aa28e80a4ed0981d570ad91b2579193404256ce672ed0a609e0d"
name = "github.com/beorn7/perks"
packages = ["quantile"]
revision = "4c0e84591b9aa9e6dcfdf3e020114cd81f89d5f9"
pruneopts = "UT"
revision = "3a771d992973f24aa725d07868b467d1ddfceafb"
[[projects]]
branch = "master"
digest = "1:15042ad3498153684d09f393bbaec6b216c8eec6d61f63dff711de7d64ed8861"
name = "github.com/golang/protobuf"
packages = ["proto"]
revision = "17ce1425424ab154092bbb43af630bd647f3bb0d"
pruneopts = "UT"
revision = "b4deda0973fb4c70b50d226b1af49f3da59f5265"
version = "v1.1.0"
[[projects]]
branch = "master"
name = "github.com/kylelemons/godebug"
packages = ["diff","pretty"]
revision = "d65d576e9348f5982d7f6d83682b694e731a45c6"
digest = "1:d2754cafcab0d22c13541618a8029a70a8959eb3525ff201fe971637e2274cd0"
name = "github.com/google/go-cmp"
packages = [
"cmp",
"cmp/cmpopts",
"cmp/internal/diff",
"cmp/internal/function",
"cmp/internal/value",
]
pruneopts = "UT"
revision = "3af367b6b30c263d47e8895973edcca9a49cf029"
version = "v0.2.0"
[[projects]]
digest = "1:ca955a9cd5b50b0f43d2cc3aeb35c951473eeca41b34eb67507f1dbcc0542394"
name = "github.com/kr/pretty"
packages = ["."]
pruneopts = "UT"
revision = "73f6ac0b30a98e433b289500d779f50c1a6f0712"
version = "v0.1.0"
[[projects]]
digest = "1:15b5cc79aad436d47019f814fde81a10221c740dc8ddf769221a65097fb6c2e9"
name = "github.com/kr/text"
packages = ["."]
pruneopts = "UT"
revision = "e2ffdb16a802fe2bb95e2e35ff34f0e53aeef34f"
version = "v0.1.0"
[[projects]]
digest = "1:ff5ebae34cfbf047d505ee150de27e60570e8c394b3b8fdbb720ff6ac71985fc"
name = "github.com/matttproud/golang_protobuf_extensions"
packages = ["pbutil"]
revision = "3247c84500bff8d9fb6d579d800f20b3e091582c"
version = "v1.0.0"
pruneopts = "UT"
revision = "c12348ce28de40eed0136aa2b644d0ee0650e56c"
version = "v1.0.1"
[[projects]]
branch = "master"
digest = "1:71520363c3acc43c35a2a53f79f6c61f110a026326c8b16dbdd351164765feac"
name = "github.com/ncabatoff/fakescraper"
packages = ["."]
pruneopts = "UT"
revision = "15938421d91a82d197de7fc59aebcac65c43407d"
[[projects]]
branch = "master"
digest = "1:9e33629d4ec9e9344715a54fa0a107f23ce800deb13999b0190df04c3540ccb5"
name = "github.com/ncabatoff/go-seq"
packages = ["seq"]
pruneopts = "UT"
revision = "b08ef85ed83364cba413c98a94bbd4169a0ce70b"
[[projects]]
branch = "add-proc-status"
digest = "1:df5079557e0fa0fe9fb973f84fffd52e32ef26ada655900fdeea9b0848766c74"
name = "github.com/ncabatoff/procfs"
packages = [
".",
"internal/util",
"nfs",
"xfs",
]
pruneopts = "UT"
revision = "e1a38cb53622f65e073c5e750e6498a44ebfbd2a"
[[projects]]
digest = "1:b6221ec0f8903b556e127c449e7106b63e6867170c2d10a7c058623d086f2081"
name = "github.com/prometheus/client_golang"
packages = ["prometheus"]
pruneopts = "UT"
revision = "c5b7fccd204277076155f10851dad72b76a49317"
version = "v0.8.0"
[[projects]]
branch = "master"
digest = "1:2d5cd61daa5565187e1d96bae64dbbc6080dacf741448e9629c64fd93203b0d4"
name = "github.com/prometheus/client_model"
packages = ["go"]
revision = "6f3806018612930941127f2a7c6c453ba2c527d2"
pruneopts = "UT"
revision = "5c3871d89910bfb32f5fcab2aa4b9ec68e65a99f"
[[projects]]
branch = "master"
digest = "1:63b68062b8968092eb86bedc4e68894bd096ea6b24920faca8b9dcf451f54bb5"
name = "github.com/prometheus/common"
packages = ["expfmt","internal/bitbucket.org/ww/goautoneg","model"]
revision = "2f17f4a9d485bf34b4bfaccc273805040e4f86c8"
packages = [
"expfmt",
"internal/bitbucket.org/ww/goautoneg",
"model",
]
pruneopts = "UT"
revision = "c7de2306084e37d54b8be01f3541a8464345e9a5"
[[projects]]
branch = "master"
digest = "1:8c49953a1414305f2ff5465147ee576dd705487c35b15918fcd4efdc0cb7a290"
name = "github.com/prometheus/procfs"
packages = [".","xfs"]
revision = "e645f4e5aaa8506fc71d6edbc5c4ff02c04c46f2"
packages = [
".",
"internal/util",
"nfs",
"xfs",
]
pruneopts = "UT"
revision = "05ee40e3a273f7245e8777337fc7b46e533a9a92"
[[projects]]
branch = "v1"
digest = "1:af715ae33cc1f5695c4b2a4e4b21d008add8802a99e15bb467ac7c32edb5000d"
name = "gopkg.in/check.v1"
packages = ["."]
revision = "20d25e2804050c1cd24a7eea1e7a6447dd0e74ec"
pruneopts = "UT"
revision = "788fd78401277ebd861206a03c884797c6ec5541"
[[projects]]
branch = "v2"
digest = "1:342378ac4dcb378a5448dd723f0784ae519383532f5e70ade24132c4c8693202"
name = "gopkg.in/yaml.v2"
packages = ["."]
revision = "eb3733d160e74a9c7e442f435eb3bea458e1d19f"
pruneopts = "UT"
revision = "5420a8b6744d3b0345ab293f6fcba19c978f1183"
version = "v2.2.1"
[solve-meta]
analyzer-name = "dep"
analyzer-version = 1
inputs-digest = "abd920f891c3e5fe2ee27ce40acbdde66e0799704d160b01f22530df003adfe1"
input-imports = [
"github.com/google/go-cmp/cmp",
"github.com/google/go-cmp/cmp/cmpopts",
"github.com/ncabatoff/fakescraper",
"github.com/ncabatoff/go-seq/seq",
"github.com/ncabatoff/procfs",
"github.com/prometheus/client_golang/prometheus",
"gopkg.in/check.v1",
"gopkg.in/yaml.v2",
]
solver-name = "gps-cdcl"
solver-version = 1

View file

@ -1,4 +1,3 @@
# Gopkg.toml example
#
# Refer to https://github.com/golang/dep/blob/master/docs/Gopkg.toml.md
@ -17,30 +16,39 @@
# source = "github.com/myfork/project2"
#
# [[override]]
# name = "github.com/x/y"
# version = "2.4.0"
# name = "github.com/x/y"
# version = "2.4.0"
#
# [prune]
# non-go = false
# go-tests = true
# unused-packages = true
[[constraint]]
branch = "master"
name = "github.com/kylelemons/godebug"
name = "github.com/google/go-cmp"
version = "0.2.0"
[[constraint]]
branch = "master"
name = "github.com/ncabatoff/fakescraper"
[[constraint]]
name = "github.com/prometheus/client_golang"
version = "0.8.0"
branch = "add-proc-status"
name = "github.com/ncabatoff/procfs"
[[constraint]]
branch = "master"
name = "github.com/prometheus/procfs"
name = "github.com/prometheus/client_golang"
version = "0.8.0"
[[constraint]]
branch = "v1"
name = "gopkg.in/check.v1"
[[constraint]]
branch = "v2"
name = "gopkg.in/yaml.v2"
version = "2.2.1"
[prune]
go-tests = true
unused-packages = true

View file

@ -1,32 +1,12 @@
# Copyright 2015 The Prometheus Authors
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
GO := GO15VENDOREXPERIMENT=1 go
FIRST_GOPATH := $(firstword $(subst :, ,$(shell $(GO) env GOPATH)))
PROMU := $(FIRST_GOPATH)/bin/promu
pkgs = $(shell $(GO) list ./... | grep -v /vendor/)
pkgs = $(shell go list ./... | grep -v /vendor/)
PREFIX ?= $(shell pwd)
BIN_DIR ?= $(shell pwd)
DOCKER_IMAGE_NAME ?= process-exporter
DOCKER_IMAGE_NAME ?= ncabatoff/process-exporter
DOCKER_IMAGE_TAG ?= $(subst /,-,$(shell git rev-parse --abbrev-ref HEAD))
SMOKE_TEST = -config.path packaging/conf/all.yaml -once-to-stdout-delay 1s |grep -q 'namedprocess_namegroup_memory_bytes{groupname="process-exporte",memtype="virtual"}'
ifdef DEBUG
bindata_flags = -debug
endif
all: format vet build test
all: format vet test build smoke
style:
@echo ">> checking code style"
@ -34,38 +14,37 @@ style:
test:
@echo ">> running short tests"
@$(GO) test -short $(pkgs)
go test -short $(pkgs)
format:
@echo ">> formatting code"
@$(GO) fmt $(pkgs)
go fmt $(pkgs)
vet:
@echo ">> vetting code"
@$(GO) vet $(pkgs)
go vet $(pkgs)
build: promu
@echo ">> building binaries"
@$(PROMU) build --prefix $(PREFIX)
build:
@echo ">> building code"
cd cmd/process-exporter; CGO_ENABLED=0 go build -o ../../process-exporter -a -tags netgo
tarball: promu
@echo ">> building release tarball"
@$(PROMU) tarball --prefix $(PREFIX) $(BIN_DIR)
smoke:
@echo ">> smoke testing process-exporter"
./process-exporter $(SMOKE_TEST)
crossbuild: promu
@echo ">> cross-building"
@$(PROMU) crossbuild
@$(PROMU) crossbuild tarballs
integ:
@echo ">> integration testing process-exporter"
go build -o integration-tester cmd/integration-tester/main.go
go build -o load-generator cmd/load-generator/main.go
./integration-tester -write-size-bytes 65536
install:
@echo ">> installing binary"
cd cmd/process-exporter; CGO_ENABLED=0 go install -a -tags netgo
docker:
@echo ">> building docker image"
@docker build -t "$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_TAG)" .
docker build -t "$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_TAG)" .
docker run --rm -v `pwd`/packaging:/packaging "$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_TAG)" $(SMOKE_TEST)
promu:
@echo ">> fetching promu"
@GOOS=$(shell uname -s | tr A-Z a-z) \
GOARCH=$(subst x86_64,amd64,$(patsubst i%86,386,$(patsubst arm%,arm,$(shell uname -m)))) \
$(GO) get -u github.com/prometheus/promu
.PHONY: all style format build test vet tarball crossbuild docker promu
.PHONY: all style format test vet build integ docker

View file

@ -1,64 +1,111 @@
# process-exporter
Prometheus exporter that mines /proc to report on selected processes.
The premise for this exporter is that sometimes you have apps that are
impractical to instrument directly, either because you don't control the code
or they're written in a language that isn't easy to instrument with Prometheus.
A fair bit of information can be gleaned from /proc, especially for
long-running programs.
[release]: https://github.com/ncabatoff/process-exporter/releases/latest
For most systems it won't be beneficial to create metrics for every process by
name: there are just too many of them and most don't do enough to merit it.
Various command-line options are provided to control how processes are grouped
and the groups are named. Run "process-exporter -man" to see a help page
giving details.
[![Release](https://img.shields.io/github/release/ncabatoff/process-exporter.svg?style=flat-square)][release]
[![Build Status](https://travis-ci.org/ncabatoff/process-exporter.svg?branch=master)](https://travis-ci.org/ncabatoff/process-exporter)
[![Powered By: GoReleaser](https://img.shields.io/badge/powered%20by-goreleaser-green.svg?branch=master)](https://github.com/goreleaser)
Metrics available currently include CPU usage, bytes written and read, and
number of processes in each group.
Some apps are impractical to instrument directly, either because you
don't control the code or they're written in a language that isn't easy to
instrument with Prometheus. We must instead resort to mining /proc.
Bytes read and written come from /proc/[pid]/io in recent enough kernels.
These correspond to the fields `read_bytes` and `write_bytes` respectively.
These IO stats come with plenty of caveats, see either the Linux kernel
documentation or man 5 proc.
## Installation
CPU usage comes from /proc/[pid]/stat fields utime (user time) and stime (system
time.) It has been translated into fractional seconds of CPU consumed. Since
it is a counter, using rate() will tell you how many fractional cores were running
code from this process during the interval given.
Either grab a package for your OS from the [Releases][release] page, or
install via [docker](https://hub.docker.com/r/ncabatoff/process-exporter/).
An example Grafana dashboard to view the metrics is available at https://grafana.net/dashboards/249
## Running
## Instrumentation cost
Usage:
process-exporter will consume CPU in proportion to the number of processes in
the system and the rate at which new ones are created. The most expensive
parts - applying regexps and executing templates - are only applied once per
process seen. If you have mostly long-running processes process-exporter
should be lightweight: each time a scrape occurs, it parses
/proc/$pid/stat and /proc/$pid/cmdline for every process being monitored
and adds a few numbers.
```
process-exporter [options] -config.path filename.yml
```
## Config
or via docker:
```
docker run -d --rm -p 9256:9256 --privileged -v /proc:/host/proc -v `pwd`:/config ncabatoff/process-exporter --procfs /host/proc -config.path /config/filename.yml
```
Important options (run process-exporter --help for full list):
-children (default:true) makes it so that any process that otherwise
isn't part of its own group becomes part of the first group found (if any) when
walking the process tree upwards. In other words, resource usage of
subprocesses is added to their parent's usage unless the subprocess identifies
as a different group name.
-recheck (default:false) means that on each scrape the process names are
re-evaluated. This is disabled by default as an optimization, but since
processes can choose to change their names, this may result in a process
falling into the wrong group if we happen to see it for the first time before
it has assumed its proper name.
-procnames is intended as a quick alternative to using a config file. Details
in the following section.
## Configuration and group naming
To select and group the processes to monitor, either provide command-line
arguments or use a YAML configuration file.
To avoid confusion with the cmdline YAML element, we'll refer to the
null-delimited contents of `/proc/<pid>/cmdline` as the array `argv[]`.
The recommended option is to use a config file via -config.path, but for
convenience and backwards compatibility the -procnames/-namemapping options
exist as an alternative.
### Using a config file
The general format of the -config.path YAML file is a top-level
`process_names` section, containing a list of name matchers:
```
process_names:
- matcher1
- matcher2
...
- matcherN
```
The default config shipped with the deb/rpm packages is:
```
process_names:
- name: "{{.Comm}}"
cmdline:
- '.+'
```
A process may only belong to one group: even if multiple items would match, the
first one listed in the file wins.
(Side note: to avoid confusion with the cmdline YAML element, we'll refer to
the command-line arguments of a process `/proc/<pid>/cmdline` as the array
`argv[]`.)
#### Using a config file: group name
Each item in `process_names` gives a recipe for identifying and naming
processes. The optional `name` tag defines a template to use to name
matching processes; if not specified, `name` defaults to `{{.ExeBase}}`.
Template variables available:
- `{{.Comm}}` contains the basename of the original executable, i.e. 2nd field in `/proc/<pid>/stat`
- `{{.ExeBase}}` contains the basename of the executable
- `{{.ExeFull}}` contains the fully qualified path of the executable
- `{{.Username}}` contains the username of the effective user
- `{{.Matches}}` map contains all the matches resulting from applying cmdline regexps
#### Using a config file: process selectors
Each item in `process_names` must contain one or more selectors (`comm`, `exe`
or `cmdline`); if more than one selector is present, they must all match. Each
selector is a list of strings to match against a process's `comm`, `argv[0]`,
or in the case of `cmdline`, a regexp to apply to the command line.
or in the case of `cmdline`, a regexp to apply to the command line. The cmdline
regexp uses the [Go syntax](https://golang.org/pkg/regexp).
For `comm` and `exe`, the list of strings is an OR, meaning any process
matching any of the strings will be added to the item's group.
@ -67,10 +114,7 @@ For `cmdline`, the list of regexes is an AND, meaning they all must match. Any
capturing groups in a regexp must use the `?P<name>` option to assign a name to
the capture, which is used to populate `.Matches`.
A process may only belong to one group: even if multiple items would match, the
first one listed in the file wins.
Other performance tips: give an exe or comm clause in addition to any cmdline
Performance tip: give an exe or comm clause in addition to any cmdline
clause, so you avoid executing the regexp when the executable name doesn't
match.
@ -95,8 +139,7 @@ process_names:
exe:
- /usr/local/bin/process-exporter
cmdline:
- -config.path\\s+(?P<Cfgfile>\\S+)
- -config.path\s+(?P<Cfgfile>\S+)
```
@ -118,43 +161,195 @@ process_names:
```
## Docker
### Using -procnames/-namemapping instead of config.path
A docker image can be created with
Every name in the procnames list becomes a process group. The default name of
a process is the value found in the second field of /proc/<pid>/stat
("comm"), which is truncated at 15 chars. Usually this is the same as the
name of the executable.
If -namemapping isn't provided, every process with a comm value present
in -procnames is assigned to a group based on that name, and any other
processes are ignored.
The -namemapping option is a comma-separated list of alternating
name,regexp values. It allows assigning a name to a process based on a
combination of the process name and command line. For example, using
-namemapping "python2,([^/]+)\.py,java,-jar\s+([^/]+).jar"
will make it so that each different python2 and java -jar invocation will be
tracked with distinct metrics. Processes whose remapped name is absent from
the procnames list will be ignored. On an Ubuntu Xenial machine being used as
a workstation, here's a good way of tracking resource usage for a few
different key user apps:
process-exporter -namemapping "upstart,(--user)" \
-procnames chromium-browse,bash,gvim,prometheus,process-exporter,upstart:-user
Since upstart --user is the parent process of the X11 session, this will
make all apps started by the user fall into the group named "upstart:-user",
unless they're one of the others named explicitly with -procnames, like gvim.
## Group Metrics
There's no meaningful way to name a process that will only ever name a single process, so process-exporter assumes that every metric will be attached
to a group of processes - not a
[process group](https://en.wikipedia.org/wiki/Process_group) in the technical
sense, just one or more processes that meet a configuration's specification
of what should be monitored and how to name it.
All these metrics start with `namedprocess_namegroup_` and have at minimum
the label `groupname`.
### num_procs gauge
Number of processes in this group.
### cpu_user_seconds_total counter
CPU usage based on /proc/[pid]/stat field utime(14) i.e. user time.
A value of 1 indicates that the processes in this group have been scheduled
in user mode for a total of 1 second on a single virtual CPU.
### cpu_system_seconds_total counter
CPU usage based on /proc/[pid]/stat field stime(15) i.e. system time.
### read_bytes_total counter
Bytes read based on /proc/[pid]/io field read_bytes. The man page
says
> Attempt to count the number of bytes which this process really did cause to be fetched from the storage layer. This is accurate for block-backed filesystems.
but I would take it with a grain of salt.
### write_bytes_total counter
Bytes written based on /proc/[pid]/io field write_bytes. As with
read_bytes, somewhat dubious. May be useful for isolating which processes
are doing the most I/O, but probably not measuring just how much I/O is happening.
### major_page_faults_total counter
Number of major page faults based on /proc/[pid]/stat field majflt(12).
### minor_page_faults_total counter
Number of minor page faults based on /proc/[pid]/stat field minflt(10).
### context_switches_total counter
Number of context switches based on /proc/[pid]/status fields voluntary_ctxt_switches
and nonvoluntary_ctxt_switches. The extra label `ctxswitchtype` can have two values:
`voluntary` and `nonvoluntary`.
### memory_bytes gauge
Number of bytes of memory used. The extra label `memtype` can have two values:
*resident*: Field rss(24) from /proc/[pid]/stat, whose doc says:
> This is just the pages which count toward text, data, or stack space. This does not include pages which have not been demand-loaded in, or which are swapped out.
*virtual*: Field vsize(23) from /proc/[pid]/stat, virtual memory size.
*swapped*: Field VmSwap from /proc/[pid]/status, translated from KB to bytes.
### open_filedesc gauge
Number of file descriptors, based on counting how many entries are in the directory
/proc/[pid]/fd.
### worst_fd_ratio gauge
Worst ratio of open filedescs to filedesc limit, amongst all the procs in the
group. The limit is the fd soft limit based on /proc/[pid]/limits.
Normally Prometheus metrics ought to be as "basic" as possible (i.e. the raw
values rather than a derived ratio), but we use a ratio here because nothing
else makes sense. Suppose there are 10 procs in a given group, each with a
soft limit of 4096, and one of them has 4000 open fds and the others all have
40, their total fdcount is 4360 and total soft limit is 40960, so the ratio
is 1:10, but in fact one of the procs is about to run out of fds. With
worst_fd_ratio we're able to know this: in the above example it would be
0.97, rather than the 0.10 you'd see if you computed sum(open_filedesc) /
sum(limit_filedesc).
### oldest_start_time_seconds gauge
Epoch time (seconds since 1970/1/1) at which the oldest process in the group
started. This is derived from field starttime(22) from /proc/[pid]/stat, added
to boot time to make it relative to epoch.
### num_threads gauge
Sum of number of threads of all process in the group. Based on field num_threads(20)
from /proc/[pid]/stat.
### states gauge
Number of threads in the group in each of various states, based on the field
state(3) from /proc/[pid]/stat.
The extra label `state` can have these values: `Running`, `Sleeping`, `Waiting`, `Zombie`, `Other`.
## Group Thread Metrics
All these metrics start with `namedprocess_namegroup_` and have at minimum
the labels `groupname` and `threadname`. `threadname` is field comm(2) from
/proc/[pid]/stat. Just as groupname breaks the set of processes down into
groups, threadname breaks a given process group down into subgroups.
### thread_count gauge
Number of threads in this thread subgroup.
### thread_cpu_seconds_total counter
Same as cpu_user_seconds_total and cpu_system_seconds_total, but broken down
per-thread subgroup. Unlike cpu_user_seconds_total/cpu_system_seconds_total,
the label `cpumode` is used to distinguish between `user` and `system` time.
### thread_io_bytes_total counter
Same as read_bytes_total and write_bytes_total, but broken down
per-thread subgroup. Unlike read_bytes_total/write_bytes_total,
the label `iomode` is used to distinguish between `read` and `write` bytes.
### thread_major_page_faults_total counter
Same as major_page_faults_total, but broken down per-thread subgroup.
### thread_minor_page_faults_total counter
Same as minor_page_faults_total, but broken down per-thread subgroup.
### thread_context_switches_total counter
Same as context_switches_total, but broken down per-thread subgroup.
## Instrumentation cost
process-exporter will consume CPU in proportion to the number of processes in
the system and the rate at which new ones are created. The most expensive
parts - applying regexps and executing templates - are only applied once per
process seen, unless the command-line option -recheck is provided.
If you have mostly long-running processes process-exporter overhead should be
minimal: each time a scrape occurs, it will parse /proc/$pid/stat and
/proc/$pid/cmdline for every process being monitored and add a few numbers.
## Dashboards
An example Grafana dashboard to view the metrics is available at https://grafana.net/dashboards/249
## Building
Install [dep](https://github.com/golang/dep), then:
```
make docker
dep ensure
make
```
Then run the docker, e.g.
```
docker run --privileged --name pexporter -d -v /proc:/host/proc -p 127.0.0.1:9256:9256 process-exporter:master -procfs /host/proc -procnames chromium-browse,bash,prometheus,gvim,upstart:-user -namemapping "upstart,(-user)"
```
This will expose metrics on http://localhost:9256/metrics. Leave off the
`127.0.0.1:` to publish on all interfaces. Leave off the --privileged and
add the --user docker run argument if you only need to monitor processes
belonging to a single user.
## History
An earlier version of this exporter had options to enable auto-discovery of
which processes were consuming resources. This functionality has been removed.
These options were based on a percentage of resource usage, e.g. if an
untracked process consumed X% of CPU during a scrape, start tracking processes
with that name. However during any given scrape it's likely that most
processes are idle, so we could add a process that consumes minimal resources
but which happened to be active during the interval preceding the current
scrape. Over time this means that a great many processes wind up being
scraped, which becomes unmanageable to visualize. This could be mitigated by
looking at resource usage over longer intervals, but ultimately I didn't feel
this feature was important enough to invest more time in at this point. It may
re-appear at some point in the future, but no promises.
Another lost feature: the "other" group was used to count usage by non-tracked
procs. This was useful to get an idea of what wasn't being monitored. But it
comes at a high cost: if you know what processes you care about, you're wasting
a lot of CPU to compute the usage of everything else that you don't care about.
The new approach is to minimize resources expended on non-tracked processes and
to require the user to whitelist the processes to track.

View file

@ -1 +0,0 @@
0.1.0

View file

@ -0,0 +1,49 @@
steps:
# - name: string
# args: string
# env: string
# dir: string
# id: string
# waitFor: string
# entrypoint: string
# secretEnv: string
# Setup the workspace
- name: gcr.io/cloud-builders/go
env: ['PROJECT_ROOT=github.com/ncabatoff/process-exporter']
args: ['env']
# Build project
- name: gcr.io/cloud-builders/docker
entrypoint: 'bash'
args: ['-c', 'docker build -t ncabatoff/process-exporter:`echo $TAG_NAME|sed s/^v//` .']
# Login to docker hub
- name: gcr.io/cloud-builders/docker
entrypoint: 'bash'
args: ['-c', 'docker login --username=ncabatoff --password=$$DOCKER_PASSWORD']
secretEnv: ['DOCKER_PASSWORD']
# Push to docker hub
- name: gcr.io/cloud-builders/docker
entrypoint: 'bash'
args: ['-c', 'docker push ncabatoff/process-exporter:`echo $TAG_NAME|sed s/^v//`']
# Create github release
- name: goreleaser/goreleaser
entrypoint: /bin/sh
dir: gopath/src/github.com
env: ['GOPATH=/workspace/gopath']
args: ['-c', 'cd ncabatoff/process-exporter && git tag $TAG_NAME && /goreleaser' ]
secretEnv: ['GITHUB_TOKEN']
secrets:
- kmsKeyName: projects/process-exporter/locations/global/keyRings/cloudbuild/cryptoKeys/mykey
secretEnv:
DOCKER_PASSWORD: |
CiQAeHUuEinm1h2j9mp8r0NjPw1l1bBwzDG+JHPUPf3GvtmdjXESMAD3wUauaxWrxid/zPunG67x
5+1CYedV5exh0XwQ32eu4UkniS7HHJNWBudklaG0JA==
GITHUB_TOKEN: |
CiQAeHUuEhEKAvfIHlUZrCgHNScm0mDKI8Z1w/N3OzDk8Ql6kAUSUQD3wUau7qRc+H7OnTUo6b2Z
DKA1eMKHNg729KfHj2ZMqZXinrJloYMbZcZRXP9xv91xCq6QJB5UoFoyYDnXGdvgXC08YUstR6UB
H0bwHhe1GQ==

View file

@ -0,0 +1,32 @@
steps:
# - name: string
# args: string
# env: string
# dir: string
# id: string
# waitFor: string
# entrypoint: string
# secretEnv: string
# - name: gcr.io/cloud-builders/curl
# args: ['-L', '-s', '-o', 'dep', 'https://github.com/golang/dep/releases/download/v0.5.0/dep-linux-amd64']
# - name: ubuntu
# args: ['chmod', '+x', 'dep']
# Setup the workspace
- name: gcr.io/cloud-builders/go
env: ['PROJECT_ROOT=github.com/ncabatoff/process-exporter']
args: ['env']
# Run dep in the workspace created in previous step
# - name: gcr.io/cloud-builders/go
# entrypoint: /bin/sh
# dir: gopath/src/github.com
# env: ['GOPATH=/workspace/gopath']
# args: ['-c', 'cd ncabatoff/process-exporter && /workspace/dep ensure -vendor-only' ]
- name: gcr.io/cloud-builders/go
entrypoint: /bin/sh
dir: gopath/src/github.com
env: ['GOPATH=/workspace/gopath']
args: ['-c', 'make -C ncabatoff/process-exporter style vet test build integ install' ]
- name: gcr.io/cloud-builders/docker
args: ['build', '--tag=gcr.io/$PROJECT_ID/process-exporter', '.', '-f', 'Dockerfile.cloudbuild']
images: ['gcr.io/$PROJECT_ID/process-exporter']

View file

@ -1,14 +1,18 @@
package common
import "fmt"
type (
NameAndCmdline struct {
Name string
Cmdline []string
ProcAttributes struct {
Name string
Cmdline []string
Username string
}
MatchNamer interface {
// MatchAndName returns false if the match failed, otherwise
// true and the resulting name.
MatchAndName(NameAndCmdline) (bool, string)
MatchAndName(ProcAttributes) (bool, string)
fmt.Stringer
}
)

View file

@ -1,173 +1,179 @@
package proc
import (
common "github.com/ncabatoff/process-exporter"
"time"
seq "github.com/ncabatoff/go-seq/seq"
common "github.com/ncabatoff/process-exporter"
)
type (
// Grouper is the top-level interface to the process metrics. All tracked
// procs sharing the same group name are aggregated.
Grouper struct {
namer common.MatchNamer
trackChildren bool
// track how much was seen last time so we can report the delta
GroupStats map[string]Counts
tracker *Tracker
// groupAccum records the historical accumulation of a group so that
// we can avoid ever decreasing the counts we return.
groupAccum map[string]Counts
tracker *Tracker
threadAccum map[string]map[string]Threads
debug bool
}
GroupCountMap map[string]GroupCounts
// GroupByName maps group name to group metrics.
GroupByName map[string]Group
GroupCounts struct {
// Threads collects metrics for threads in a group sharing a thread name.
Threads struct {
Name string
NumThreads int
Counts
Procs int
Memresident uint64
Memvirtual uint64
}
// Group describes the metrics of a single group.
Group struct {
Counts
States
Wchans map[string]int
Procs int
Memory
OldestStartTime time.Time
OpenFDs uint64
WorstFDratio float64
NumThreads uint64
Threads []Threads
}
)
func NewGrouper(trackChildren bool, namer common.MatchNamer) *Grouper {
// Returns true if x < y. Test designers should ensure they always have
// a unique name/numthreads combination for each group.
func lessThreads(x, y Threads) bool { return seq.Compare(x, y) < 0 }
// NewGrouper creates a grouper.
func NewGrouper(namer common.MatchNamer, trackChildren, alwaysRecheck, debug bool) *Grouper {
g := Grouper{
trackChildren: trackChildren,
namer: namer,
GroupStats: make(map[string]Counts),
tracker: NewTracker(),
groupAccum: make(map[string]Counts),
threadAccum: make(map[string]map[string]Threads),
tracker: NewTracker(namer, trackChildren, alwaysRecheck, debug),
debug: debug,
}
return &g
}
func (g *Grouper) checkAncestry(idinfo ProcIdInfo, newprocs map[ProcId]ProcIdInfo) string {
ppid := idinfo.ParentPid
pProcId := g.tracker.ProcIds[ppid]
if pProcId.Pid < 1 {
// Reached root of process tree without finding a tracked parent.
g.tracker.Ignore(idinfo.ProcId)
return ""
}
// Is the parent already known to the tracker?
if ptproc, ok := g.tracker.Tracked[pProcId]; ok {
if ptproc != nil {
// We've found a tracked parent.
g.tracker.Track(ptproc.GroupName, idinfo)
return ptproc.GroupName
} else {
// We've found an untracked parent.
g.tracker.Ignore(idinfo.ProcId)
return ""
}
}
// Is the parent another new process?
if pinfoid, ok := newprocs[pProcId]; ok {
if name := g.checkAncestry(pinfoid, newprocs); name != "" {
// We've found a tracked parent, which implies this entire lineage should be tracked.
g.tracker.Track(name, idinfo)
return name
}
}
// Parent is dead, i.e. we never saw it, or there's no tracked proc in our ancestry.
g.tracker.Ignore(idinfo.ProcId)
return ""
}
// Update tracks any new procs that should be according to policy, and updates
// the metrics for already tracked procs. Permission errors are returned as a
// count, and will not affect the error return value.
func (g *Grouper) Update(iter ProcIter) (int, error) {
newProcs, permErrs, err := g.tracker.Update(iter)
if err != nil {
return permErrs, err
}
// Step 1: track any new proc that should be tracked based on its name and cmdline.
untracked := make(map[ProcId]ProcIdInfo)
for _, idinfo := range newProcs {
wanted, gname := g.namer.MatchAndName(common.NameAndCmdline{Name: idinfo.Name, Cmdline: idinfo.Cmdline})
if !wanted {
untracked[idinfo.ProcId] = idinfo
continue
}
g.tracker.Track(gname, idinfo)
}
// Step 2: track any untracked new proc that should be tracked because its parent is tracked.
if !g.trackChildren {
return permErrs, nil
}
for _, idinfo := range untracked {
if _, ok := g.tracker.Tracked[idinfo.ProcId]; ok {
// Already tracked or ignored
continue
}
g.checkAncestry(idinfo, untracked)
}
return permErrs, nil
}
// groups returns the aggregate metrics for all groups tracked. This reflects
// solely what's currently running.
func (g *Grouper) groups() GroupCountMap {
gcounts := make(GroupCountMap)
func groupadd(grp Group, ts Update) Group {
var zeroTime time.Time
for _, tinfo := range g.tracker.Tracked {
if tinfo == nil {
continue
}
cur := gcounts[tinfo.GroupName]
cur.Procs++
tstats := tinfo.GetStats()
cur.Memresident += tstats.Memory.Resident
cur.Memvirtual += tstats.Memory.Virtual
cur.OpenFDs += tstats.Filedesc.Open
openratio := float64(tstats.Filedesc.Open) / float64(tstats.Filedesc.Limit)
if cur.WorstFDratio < openratio {
cur.WorstFDratio = openratio
}
cur.Counts.Cpu += tstats.latest.Cpu
cur.Counts.ReadBytes += tstats.latest.ReadBytes
cur.Counts.WriteBytes += tstats.latest.WriteBytes
if cur.OldestStartTime == zeroTime || tstats.start.Before(cur.OldestStartTime) {
cur.OldestStartTime = tstats.start
}
gcounts[tinfo.GroupName] = cur
grp.Procs++
grp.Memory.ResidentBytes += ts.Memory.ResidentBytes
grp.Memory.VirtualBytes += ts.Memory.VirtualBytes
grp.Memory.VmSwapBytes += ts.Memory.VmSwapBytes
if ts.Filedesc.Open != -1 {
grp.OpenFDs += uint64(ts.Filedesc.Open)
}
openratio := float64(ts.Filedesc.Open) / float64(ts.Filedesc.Limit)
if grp.WorstFDratio < openratio {
grp.WorstFDratio = openratio
}
grp.NumThreads += ts.NumThreads
grp.Counts.Add(ts.Latest)
grp.States.Add(ts.States)
if grp.OldestStartTime == zeroTime || ts.Start.Before(grp.OldestStartTime) {
grp.OldestStartTime = ts.Start
}
return gcounts
if grp.Wchans == nil {
grp.Wchans = make(map[string]int)
}
for wchan, count := range ts.Wchans {
grp.Wchans[wchan] += count
}
return grp
}
// Groups returns GroupCounts with Counts that never decrease in value from one
// call to the next. Even if processes exit, their CPU and IO contributions up
// to that point are included in the results. Even if no processes remain
// in a group it will still be included in the results.
func (g *Grouper) Groups() GroupCountMap {
groups := g.groups()
// Update asks the tracker to report on each tracked process by name.
// These are aggregated by groupname, augmented by accumulated counts
// from the past, and returned. Note that while the Tracker reports
// only what counts have changed since last cycle, Grouper.Update
// returns counts that never decrease. Even once the last process
// with name X disappears, name X will still appear in the results
// with the same counts as before; of course, all non-count metrics
// will be zero.
func (g *Grouper) Update(iter Iter) (CollectErrors, GroupByName, error) {
cerrs, tracked, err := g.tracker.Update(iter)
if err != nil {
return cerrs, nil, err
}
return cerrs, g.groups(tracked), nil
}
// First add any accumulated counts to what was just observed,
// Translate the updates into a new GroupByName and update internal history.
func (g *Grouper) groups(tracked []Update) GroupByName {
groups := make(GroupByName)
threadsByGroup := make(map[string][]ThreadUpdate)
for _, update := range tracked {
groups[update.GroupName] = groupadd(groups[update.GroupName], update)
if update.Threads != nil {
threadsByGroup[update.GroupName] =
append(threadsByGroup[update.GroupName], update.Threads...)
}
}
// Add any accumulated counts to what was just observed,
// and update the accumulators.
for gname, group := range groups {
if oldcounts, ok := g.GroupStats[gname]; ok {
group.Counts.Cpu += oldcounts.Cpu
group.Counts.ReadBytes += oldcounts.ReadBytes
group.Counts.WriteBytes += oldcounts.WriteBytes
if oldcounts, ok := g.groupAccum[gname]; ok {
group.Counts.Add(Delta(oldcounts))
}
g.GroupStats[gname] = group.Counts
g.groupAccum[gname] = group.Counts
group.Threads = g.threads(gname, threadsByGroup[gname])
groups[gname] = group
}
// Now add any groups that were observed in the past but aren't running now.
for gname, gcounts := range g.GroupStats {
for gname, gcounts := range g.groupAccum {
if _, ok := groups[gname]; !ok {
groups[gname] = GroupCounts{Counts: gcounts}
groups[gname] = Group{Counts: gcounts}
}
}
return groups
}
func (g *Grouper) threads(gname string, tracked []ThreadUpdate) []Threads {
if len(tracked) == 0 {
delete(g.threadAccum, gname)
return nil
}
ret := make([]Threads, 0, len(tracked))
threads := make(map[string]Threads)
// First aggregate the thread metrics by thread name.
for _, nc := range tracked {
curthr := threads[nc.ThreadName]
curthr.NumThreads++
curthr.Counts.Add(nc.Latest)
curthr.Name = nc.ThreadName
threads[nc.ThreadName] = curthr
}
// Add any accumulated counts to what was just observed,
// and update the accumulators.
if history := g.threadAccum[gname]; history != nil {
for tname := range threads {
if oldcounts, ok := history[tname]; ok {
counts := threads[tname]
counts.Add(Delta(oldcounts.Counts))
threads[tname] = counts
}
}
}
g.threadAccum[gname] = threads
for _, thr := range threads {
ret = append(ret, thr)
}
return ret
}

View file

@ -2,18 +2,21 @@ package proc
import (
"fmt"
"os"
"path/filepath"
"strconv"
"time"
"github.com/prometheus/procfs"
"github.com/ncabatoff/procfs"
)
func newProcIdStatic(pid, ppid int, startTime uint64, name string, cmdline []string) ProcIdStatic {
return ProcIdStatic{ProcId{pid, startTime}, ProcStatic{name, cmdline, ppid, time.Time{}}}
}
// ErrProcNotExist indicates a process couldn't be read because it doesn't exist,
// typically because it disappeared while we were reading it.
var ErrProcNotExist = fmt.Errorf("process does not exist")
type (
// ProcId uniquely identifies a process.
ProcId struct {
// ID uniquely identifies a process.
ID struct {
// UNIX process id
Pid int
// The time the process started after system boot, the value is expressed
@ -21,82 +24,138 @@ type (
StartTimeRel uint64
}
// ProcStatic contains data read from /proc/pid/*
ProcStatic struct {
Name string
Cmdline []string
ParentPid int
StartTime time.Time
ThreadID ID
// Static contains data read from /proc/pid/*
Static struct {
Name string
Cmdline []string
ParentPid int
StartTime time.Time
EffectiveUID int
}
// ProcMetrics contains data read from /proc/pid/*
ProcMetrics struct {
CpuTime float64
ReadBytes uint64
WriteBytes uint64
// Counts are metric counters common to threads and processes and groups.
Counts struct {
CPUUserTime float64
CPUSystemTime float64
ReadBytes uint64
WriteBytes uint64
MajorPageFaults uint64
MinorPageFaults uint64
CtxSwitchVoluntary uint64
CtxSwitchNonvoluntary uint64
}
// Memory describes a proc's memory usage.
Memory struct {
ResidentBytes uint64
VirtualBytes uint64
OpenFDs uint64
MaxFDs uint64
VmSwapBytes uint64
}
ProcIdStatic struct {
ProcId
ProcStatic
// Filedesc describes a proc's file descriptor usage and soft limit.
Filedesc struct {
// Open is the count of open file descriptors, -1 if unknown.
Open int64
// Limit is the fd soft limit for the process.
Limit uint64
}
ProcInfo struct {
ProcStatic
ProcMetrics
// States counts how many threads are in each state.
States struct {
Running int
Sleeping int
Waiting int
Zombie int
Other int
}
ProcIdInfo struct {
ProcId
ProcStatic
ProcMetrics
// Metrics contains data read from /proc/pid/*
Metrics struct {
Counts
Memory
Filedesc
NumThreads uint64
States
Wchan string
}
// Thread contains per-thread data.
Thread struct {
ThreadID
ThreadName string
Counts
Wchan string
States
}
// IDInfo groups all info for a single process.
IDInfo struct {
ID
Static
Metrics
Threads []Thread
}
// ProcIdInfoThreads struct {
// ProcIdInfo
// Threads []ProcThread
// }
// Proc wraps the details of the underlying procfs-reading library.
// Any of these methods may fail if the process has disapeared.
// We try to return as much as possible rather than an error, e.g.
// if some /proc files are unreadable.
Proc interface {
// GetPid() returns the POSIX PID (process id). They may be reused over time.
GetPid() int
// GetProcId() returns (pid,starttime), which can be considered a unique process id.
// It may fail if the caller doesn't have permission to read /proc/<pid>/stat, or if
// the process has disapeared.
GetProcId() (ProcId, error)
// GetProcID() returns (pid,starttime), which can be considered a unique process id.
GetProcID() (ID, error)
// GetStatic() returns various details read from files under /proc/<pid>/. Technically
// name may not be static, but we'll pretend it is.
// It may fail if the caller doesn't have permission to read those files, or if
// the process has disapeared.
GetStatic() (ProcStatic, error)
GetStatic() (Static, error)
// GetMetrics() returns various metrics read from files under /proc/<pid>/.
// It may fail if the caller doesn't have permission to read those files, or if
// the process has disapeared.
GetMetrics() (ProcMetrics, error)
// It returns an error on complete failure. Otherwise, it returns metrics
// and 0 on complete success, 1 if some (like I/O) couldn't be read.
GetMetrics() (Metrics, int, error)
GetStates() (States, error)
GetWchan() (string, error)
GetCounts() (Counts, int, error)
GetThreads() ([]Thread, error)
}
// proc is a wrapper for procfs.Proc that caches results of some reads and implements Proc.
proc struct {
// proccache implements the Proc interface by acting as wrapper for procfs.Proc
// that caches results of some reads.
proccache struct {
procfs.Proc
procid *ProcId
stat *procfs.ProcStat
cmdline []string
io *procfs.ProcIO
bootTime uint64
procid *ID
stat *procfs.ProcStat
status *procfs.ProcStatus
cmdline []string
io *procfs.ProcIO
fs *FS
wchan *string
}
proc struct {
proccache
}
// procs is a fancier []Proc that saves on some copying.
procs interface {
get(int) Proc
length() int
}
// procfsprocs implements procs using procfs.
procfsprocs struct {
Procs []procfs.Proc
bootTime uint64
Procs []procfs.Proc
fs *FS
}
// ProcIter is an iterator over a sequence of procs.
ProcIter interface {
// Iter is an iterator over a sequence of procs.
Iter interface {
// Next returns true if the iterator is not exhausted.
Next() bool
// Close releases any resources the iterator uses.
@ -105,7 +164,7 @@ type (
Proc
}
// procIterator implements the ProcIter interface using procfs.
// procIterator implements the Iter interface
procIterator struct {
// procs is the list of Proc we're iterating over.
procs
@ -119,66 +178,101 @@ type (
Proc
}
procIdInfos []ProcIdInfo
// Source is a source of procs.
Source interface {
// AllProcs returns all the processes in this source at this moment in time.
AllProcs() Iter
}
// FS implements Source.
FS struct {
procfs.FS
BootTime uint64
MountPoint string
debug bool
}
)
func procInfoIter(ps ...ProcIdInfo) ProcIter {
return &procIterator{procs: procIdInfos(ps), idx: -1}
func (ii IDInfo) String() string {
return fmt.Sprintf("%+v:%+v", ii.ID, ii.Static)
}
func Info(p Proc) (ProcIdInfo, error) {
id, err := p.GetProcId()
if err != nil {
return ProcIdInfo{}, err
}
static, err := p.GetStatic()
if err != nil {
return ProcIdInfo{}, err
}
metrics, err := p.GetMetrics()
if err != nil {
return ProcIdInfo{}, err
}
return ProcIdInfo{id, static, metrics}, nil
// Add adds c2 to the counts.
func (c *Counts) Add(c2 Delta) {
c.CPUUserTime += c2.CPUUserTime
c.CPUSystemTime += c2.CPUSystemTime
c.ReadBytes += c2.ReadBytes
c.WriteBytes += c2.WriteBytes
c.MajorPageFaults += c2.MajorPageFaults
c.MinorPageFaults += c2.MinorPageFaults
c.CtxSwitchVoluntary += c2.CtxSwitchVoluntary
c.CtxSwitchNonvoluntary += c2.CtxSwitchNonvoluntary
}
func (p procIdInfos) get(i int) Proc {
return &p[i]
// Sub subtracts c2 from the counts.
func (c Counts) Sub(c2 Counts) Delta {
c.CPUUserTime -= c2.CPUUserTime
c.CPUSystemTime -= c2.CPUSystemTime
c.ReadBytes -= c2.ReadBytes
c.WriteBytes -= c2.WriteBytes
c.MajorPageFaults -= c2.MajorPageFaults
c.MinorPageFaults -= c2.MinorPageFaults
c.CtxSwitchVoluntary -= c2.CtxSwitchVoluntary
c.CtxSwitchNonvoluntary -= c2.CtxSwitchNonvoluntary
return Delta(c)
}
func (p procIdInfos) length() int {
return len(p)
func (s *States) Add(s2 States) {
s.Other += s2.Other
s.Running += s2.Running
s.Sleeping += s2.Sleeping
s.Waiting += s2.Waiting
s.Zombie += s2.Zombie
}
func (p ProcIdInfo) GetPid() int {
return p.ProcId.Pid
func (p IDInfo) GetThreads() ([]Thread, error) {
return p.Threads, nil
}
func (p ProcIdInfo) GetProcId() (ProcId, error) {
return p.ProcId, nil
// GetPid implements Proc.
func (p IDInfo) GetPid() int {
return p.ID.Pid
}
func (p ProcIdInfo) GetStatic() (ProcStatic, error) {
return p.ProcStatic, nil
// GetProcID implements Proc.
func (p IDInfo) GetProcID() (ID, error) {
return p.ID, nil
}
func (p ProcIdInfo) GetMetrics() (ProcMetrics, error) {
return p.ProcMetrics, nil
// GetStatic implements Proc.
func (p IDInfo) GetStatic() (Static, error) {
return p.Static, nil
}
func (p procfsprocs) get(i int) Proc {
return &proc{Proc: p.Procs[i], bootTime: p.bootTime}
// GetCounts implements Proc.
func (p IDInfo) GetCounts() (Counts, int, error) {
return p.Metrics.Counts, 0, nil
}
func (p procfsprocs) length() int {
return len(p.Procs)
// GetMetrics implements Proc.
func (p IDInfo) GetMetrics() (Metrics, int, error) {
return p.Metrics, 0, nil
}
func (p *proc) GetPid() int {
// GetStates implements Proc.
func (p IDInfo) GetStates() (States, error) {
return p.States, nil
}
func (p IDInfo) GetWchan() (string, error) {
return p.Wchan, nil
}
func (p *proccache) GetPid() int {
return p.Proc.PID
}
func (p *proc) GetStat() (procfs.ProcStat, error) {
func (p *proccache) getStat() (procfs.ProcStat, error) {
if p.stat == nil {
stat, err := p.Proc.NewStat()
if err != nil {
@ -190,19 +284,32 @@ func (p *proc) GetStat() (procfs.ProcStat, error) {
return *p.stat, nil
}
func (p *proc) GetProcId() (ProcId, error) {
if p.procid == nil {
stat, err := p.GetStat()
func (p *proccache) getStatus() (procfs.ProcStatus, error) {
if p.status == nil {
status, err := p.Proc.NewStatus()
if err != nil {
return ProcId{}, err
return procfs.ProcStatus{}, err
}
p.procid = &ProcId{Pid: p.GetPid(), StartTimeRel: stat.Starttime}
p.status = &status
}
return *p.status, nil
}
// GetProcID implements Proc.
func (p *proccache) GetProcID() (ID, error) {
if p.procid == nil {
stat, err := p.getStat()
if err != nil {
return ID{}, err
}
p.procid = &ID{Pid: p.GetPid(), StartTimeRel: stat.Starttime}
}
return *p.procid, nil
}
func (p *proc) GetCmdLine() ([]string, error) {
func (p *proccache) getCmdLine() ([]string, error) {
if p.cmdline == nil {
cmdline, err := p.Proc.CmdLine()
if err != nil {
@ -213,7 +320,18 @@ func (p *proc) GetCmdLine() ([]string, error) {
return p.cmdline, nil
}
func (p *proc) GetIo() (procfs.ProcIO, error) {
func (p *proccache) getWchan() (string, error) {
if p.wchan == nil {
wchan, err := p.Proc.Wchan()
if err != nil {
return "", err
}
p.wchan = &wchan
}
return *p.wchan, nil
}
func (p *proccache) getIo() (procfs.ProcIO, error) {
if p.io == nil {
io, err := p.Proc.NewIO()
if err != nil {
@ -224,56 +342,199 @@ func (p *proc) GetIo() (procfs.ProcIO, error) {
return *p.io, nil
}
func (p proc) GetStatic() (ProcStatic, error) {
cmdline, err := p.GetCmdLine()
// GetStatic returns the ProcStatic corresponding to this proc.
func (p *proccache) GetStatic() (Static, error) {
// /proc/<pid>/cmdline is normally world-readable.
cmdline, err := p.getCmdLine()
if err != nil {
return ProcStatic{}, err
return Static{}, err
}
stat, err := p.GetStat()
// /proc/<pid>/stat is normally world-readable.
stat, err := p.getStat()
if err != nil {
return ProcStatic{}, err
return Static{}, err
}
startTime := time.Unix(int64(p.bootTime), 0)
startTime := time.Unix(int64(p.fs.BootTime), 0).UTC()
startTime = startTime.Add(time.Second / userHZ * time.Duration(stat.Starttime))
return ProcStatic{
Name: stat.Comm,
Cmdline: cmdline,
ParentPid: stat.PPID,
StartTime: startTime,
// /proc/<pid>/status is normally world-readable.
status, err := p.getStatus()
if err != nil {
return Static{}, err
}
return Static{
Name: stat.Comm,
Cmdline: cmdline,
ParentPid: stat.PPID,
StartTime: startTime,
EffectiveUID: status.UIDEffective,
}, nil
}
func (p proc) GetMetrics() (ProcMetrics, error) {
io, err := p.GetIo()
func (p proc) GetCounts() (Counts, int, error) {
stat, err := p.getStat()
if err != nil {
return ProcMetrics{}, err
if err == os.ErrNotExist {
err = ErrProcNotExist
}
return Counts{}, 0, err
}
stat, err := p.GetStat()
status, err := p.getStatus()
if err != nil {
return ProcMetrics{}, err
if err == os.ErrNotExist {
err = ErrProcNotExist
}
return Counts{}, 0, err
}
io, err := p.getIo()
softerrors := 0
if err != nil {
softerrors++
}
return Counts{
CPUUserTime: float64(stat.UTime) / userHZ,
CPUSystemTime: float64(stat.STime) / userHZ,
ReadBytes: io.ReadBytes,
WriteBytes: io.WriteBytes,
MajorPageFaults: uint64(stat.MajFlt),
MinorPageFaults: uint64(stat.MinFlt),
CtxSwitchVoluntary: uint64(status.VoluntaryCtxtSwitches),
CtxSwitchNonvoluntary: uint64(status.NonvoluntaryCtxtSwitches),
}, softerrors, nil
}
func (p proc) GetWchan() (string, error) {
return p.getWchan()
}
func (p proc) GetStates() (States, error) {
stat, err := p.getStat()
if err != nil {
return States{}, err
}
var s States
switch stat.State {
case "R":
s.Running++
case "S":
s.Sleeping++
case "D":
s.Waiting++
case "Z":
s.Zombie++
default:
s.Other++
}
return s, nil
}
// GetMetrics returns the current metrics for the proc. The results are
// not cached.
func (p proc) GetMetrics() (Metrics, int, error) {
counts, softerrors, err := p.GetCounts()
if err != nil {
return Metrics{}, 0, err
}
// We don't need to check for error here because p will have cached
// the successful result of calling getStat in GetCounts.
// Since GetMetrics isn't a pointer receiver method, our callers
// won't see the effect of the caching between calls.
stat, _ := p.getStat()
// Ditto for states
states, _ := p.GetStates()
status, err := p.getStatus()
if err != nil {
return Metrics{}, 0, err
}
numfds, err := p.Proc.FileDescriptorsLen()
if err != nil {
return ProcMetrics{}, err
numfds = -1
softerrors |= 1
}
limits, err := p.NewLimits()
limits, err := p.Proc.NewLimits()
if err != nil {
return ProcMetrics{}, err
return Metrics{}, 0, err
}
return ProcMetrics{
CpuTime: stat.CPUTime(),
ReadBytes: io.ReadBytes,
WriteBytes: io.WriteBytes,
ResidentBytes: uint64(stat.ResidentMemory()),
VirtualBytes: uint64(stat.VirtualMemory()),
OpenFDs: uint64(numfds),
MaxFDs: uint64(limits.OpenFiles),
}, nil
wchan, err := p.getWchan()
if err != nil {
softerrors |= 1
}
return Metrics{
Counts: counts,
Memory: Memory{
ResidentBytes: uint64(stat.ResidentMemory()),
VirtualBytes: uint64(stat.VirtualMemory()),
VmSwapBytes: uint64(status.VmSwapKB * 1024),
},
Filedesc: Filedesc{
Open: int64(numfds),
Limit: uint64(limits.OpenFiles),
},
NumThreads: uint64(stat.NumThreads),
States: states,
Wchan: wchan,
}, softerrors, nil
}
type FS struct {
procfs.FS
BootTime uint64
func (p proc) GetThreads() ([]Thread, error) {
fs, err := p.fs.threadFs(p.PID)
if err != nil {
return nil, err
}
threads := []Thread{}
iter := fs.AllProcs()
for iter.Next() {
var id ID
id, err = iter.GetProcID()
if err != nil {
continue
}
var static Static
static, err = iter.GetStatic()
if err != nil {
continue
}
var counts Counts
counts, _, err = iter.GetCounts()
if err != nil {
continue
}
wchan, _ := iter.GetWchan()
states, _ := iter.GetStates()
threads = append(threads, Thread{
ThreadID: ThreadID(id),
ThreadName: static.Name,
Counts: counts,
Wchan: wchan,
States: states,
})
}
err = iter.Close()
if err != nil {
return nil, err
}
if len(threads) < 2 {
return nil, nil
}
return threads, nil
}
// See https://github.com/prometheus/procfs/blob/master/proc_stat.go for details on userHZ.
@ -281,7 +542,7 @@ const userHZ = 100
// NewFS returns a new FS mounted under the given mountPoint. It will error
// if the mount point can't be read.
func NewFS(mountPoint string) (*FS, error) {
func NewFS(mountPoint string, debug bool) (*FS, error) {
fs, err := procfs.NewFS(mountPoint)
if err != nil {
return nil, err
@ -290,17 +551,38 @@ func NewFS(mountPoint string) (*FS, error) {
if err != nil {
return nil, err
}
return &FS{fs, stat.BootTime}, nil
return &FS{fs, stat.BootTime, mountPoint, debug}, nil
}
func (fs *FS) AllProcs() ProcIter {
func (fs *FS) threadFs(pid int) (*FS, error) {
mountPoint := filepath.Join(fs.MountPoint, strconv.Itoa(pid), "task")
tfs, err := procfs.NewFS(mountPoint)
if err != nil {
return nil, err
}
return &FS{tfs, fs.BootTime, mountPoint, false}, nil
}
// AllProcs implements Source.
func (fs *FS) AllProcs() Iter {
procs, err := fs.FS.AllProcs()
if err != nil {
err = fmt.Errorf("Error reading procs: %v", err)
}
return &procIterator{procs: procfsprocs{procs, fs.BootTime}, err: err, idx: -1}
return &procIterator{procs: procfsprocs{procs, fs}, err: err, idx: -1}
}
// get implements procs.
func (p procfsprocs) get(i int) Proc {
return &proc{proccache{Proc: p.Procs[i], fs: p.fs}}
}
// length implements procs.
func (p procfsprocs) length() int {
return len(p.Procs)
}
// Next implements Iter.
func (pi *procIterator) Next() bool {
pi.idx++
if pi.idx < pi.procs.length() {
@ -311,6 +593,7 @@ func (pi *procIterator) Next() bool {
return pi.idx < pi.procs.length()
}
// Close implements Iter.
func (pi *procIterator) Close() error {
pi.Next()
pi.procs = nil

View file

@ -2,179 +2,432 @@ package proc
import (
"fmt"
"os"
"log"
"os/user"
"strconv"
"time"
seq "github.com/ncabatoff/go-seq/seq"
common "github.com/ncabatoff/process-exporter"
)
type (
Counts struct {
Cpu float64
ReadBytes uint64
WriteBytes uint64
}
Memory struct {
Resident uint64
Virtual uint64
}
Filedesc struct {
Open uint64
Limit uint64
}
// Tracker tracks processes and records metrics.
Tracker struct {
// Tracked holds the processes are being monitored. Processes
// namer determines what processes to track and names them
namer common.MatchNamer
// tracked holds the processes are being monitored. Processes
// may be blacklisted such that they no longer get tracked by
// setting their value in the Tracked map to nil.
Tracked map[ProcId]*TrackedProc
// ProcIds is a map from pid to ProcId. This is a convenience
// setting their value in the tracked map to nil.
tracked map[ID]*trackedProc
// procIds is a map from pid to ProcId. This is a convenience
// to allow finding the Tracked entry of a parent process.
ProcIds map[int]ProcId
procIds map[int]ID
// trackChildren makes Tracker track descendants of procs the
// namer wanted tracked.
trackChildren bool
// never ignore processes, i.e. always re-check untracked processes in case comm has changed
alwaysRecheck bool
username map[int]string
debug bool
}
// TrackedProc accumulates metrics for a process, as well as
// Delta is an alias of Counts used to signal that its contents are not
// totals, but rather the result of subtracting two totals.
Delta Counts
trackedThread struct {
name string
accum Counts
latest Delta
lastUpdate time.Time
wchan string
}
// trackedProc accumulates metrics for a process, as well as
// remembering an optional GroupName tag associated with it.
TrackedProc struct {
trackedProc struct {
// lastUpdate is used internally during the update cycle to find which procs have exited
lastUpdate time.Time
// info is the most recently obtained info for this proc
info ProcInfo
// accum is the total CPU and IO accrued since we started tracking this proc
accum Counts
// lastaccum is the CPU and IO accrued in the last Update()
lastaccum Counts
// GroupName is an optional tag for this proc.
GroupName string
// static
static Static
metrics Metrics
// lastaccum is the increment to the counters seen in the last update.
lastaccum Delta
// groupName is the tag for this proc given by the namer.
groupName string
threads map[ThreadID]trackedThread
}
trackedStats struct {
aggregate, latest Counts
// ThreadUpdate describes what's changed for a thread since the last cycle.
ThreadUpdate struct {
// ThreadName is the name of the thread based on field of stat.
ThreadName string
// Latest is how much the counts increased since last cycle.
Latest Delta
}
// Update reports on the latest stats for a process.
Update struct {
// GroupName is the name given by the namer to the process.
GroupName string
// Latest is how much the counts increased since last cycle.
Latest Delta
// Memory is the current memory usage.
Memory
// Filedesc is the current fd usage/limit.
Filedesc
start time.Time
// Start is the time the process started.
Start time.Time
// NumThreads is the number of threads.
NumThreads uint64
// States is how many processes are in which run state.
States
// Wchans is how many threads are in each non-zero wchan.
Wchans map[string]int
// Threads are the thread updates for this process.
Threads []ThreadUpdate
}
// CollectErrors describes non-fatal errors found while collecting proc
// metrics.
CollectErrors struct {
// Read is incremented every time GetMetrics() returns an error.
// This means we failed to load even the basics for the process,
// and not just because it disappeared on us.
Read int
// Partial is incremented every time we're unable to collect
// some metrics (e.g. I/O) for a tracked proc, but we're still able
// to get the basic stuff like cmdline and core stats.
Partial int
}
)
func (tp *TrackedProc) GetName() string {
return tp.info.Name
func lessUpdateGroupName(x, y Update) bool { return x.GroupName < y.GroupName }
func lessThreadUpdate(x, y ThreadUpdate) bool { return seq.Compare(x, y) < 0 }
func lessCounts(x, y Counts) bool { return seq.Compare(x, y) < 0 }
func (tp *trackedProc) getUpdate() Update {
u := Update{
GroupName: tp.groupName,
Latest: tp.lastaccum,
Memory: tp.metrics.Memory,
Filedesc: tp.metrics.Filedesc,
Start: tp.static.StartTime,
NumThreads: tp.metrics.NumThreads,
States: tp.metrics.States,
Wchans: make(map[string]int),
}
if tp.metrics.Wchan != "" {
u.Wchans[tp.metrics.Wchan] = 1
}
if len(tp.threads) > 1 {
for _, tt := range tp.threads {
u.Threads = append(u.Threads, ThreadUpdate{tt.name, tt.latest})
if tt.wchan != "" {
u.Wchans[tt.wchan]++
}
}
}
return u
}
func (tp *TrackedProc) GetCmdLine() []string {
return tp.info.Cmdline
}
func (tp *TrackedProc) GetStats() trackedStats {
mem := Memory{Resident: tp.info.ResidentBytes, Virtual: tp.info.VirtualBytes}
fd := Filedesc{Open: tp.info.OpenFDs, Limit: tp.info.MaxFDs}
return trackedStats{
aggregate: tp.accum,
latest: tp.lastaccum,
Memory: mem,
Filedesc: fd,
start: tp.info.StartTime,
// NewTracker creates a Tracker.
func NewTracker(namer common.MatchNamer, trackChildren, alwaysRecheck, debug bool) *Tracker {
return &Tracker{
namer: namer,
tracked: make(map[ID]*trackedProc),
procIds: make(map[int]ID),
trackChildren: trackChildren,
alwaysRecheck: alwaysRecheck,
username: make(map[int]string),
debug: debug,
}
}
func NewTracker() *Tracker {
return &Tracker{Tracked: make(map[ProcId]*TrackedProc), ProcIds: make(map[int]ProcId)}
func (t *Tracker) track(groupName string, idinfo IDInfo) {
tproc := trackedProc{
groupName: groupName,
static: idinfo.Static,
metrics: idinfo.Metrics,
}
if len(idinfo.Threads) > 0 {
tproc.threads = make(map[ThreadID]trackedThread)
for _, thr := range idinfo.Threads {
tproc.threads[thr.ThreadID] = trackedThread{
thr.ThreadName, thr.Counts, Delta{}, time.Time{}, thr.Wchan}
}
}
t.tracked[idinfo.ID] = &tproc
}
func (t *Tracker) Track(groupName string, idinfo ProcIdInfo) {
info := ProcInfo{idinfo.ProcStatic, idinfo.ProcMetrics}
t.Tracked[idinfo.ProcId] = &TrackedProc{GroupName: groupName, info: info}
func (t *Tracker) ignore(id ID) {
// only ignore ID if we didn't set recheck to true
if t.alwaysRecheck == false {
t.tracked[id] = nil
}
}
func (t *Tracker) Ignore(id ProcId) {
t.Tracked[id] = nil
func (tp *trackedProc) update(metrics Metrics, now time.Time, cerrs *CollectErrors, threads []Thread) {
// newcounts: resource consumption since last cycle
newcounts := metrics.Counts
tp.lastaccum = newcounts.Sub(tp.metrics.Counts)
tp.metrics = metrics
tp.lastUpdate = now
if len(threads) > 1 {
if tp.threads == nil {
tp.threads = make(map[ThreadID]trackedThread)
}
for _, thr := range threads {
tt := trackedThread{thr.ThreadName, thr.Counts, Delta{}, now, thr.Wchan}
if old, ok := tp.threads[thr.ThreadID]; ok {
tt.latest, tt.accum = thr.Counts.Sub(old.accum), thr.Counts
}
tp.threads[thr.ThreadID] = tt
}
for id, tt := range tp.threads {
if tt.lastUpdate != now {
delete(tp.threads, id)
}
}
} else {
tp.threads = nil
}
}
// Scan procs and update metrics for those which are tracked. Processes that have gone
// away get removed from the Tracked map. New processes are returned, along with the count
// of permission errors.
func (t *Tracker) Update(procs ProcIter) ([]ProcIdInfo, int, error) {
now := time.Now()
var newProcs []ProcIdInfo
var permissionErrors int
// handleProc updates the tracker if it's a known and not ignored proc.
// If it's neither known nor ignored, newProc will be non-nil.
// It is not an error if the process disappears while we are reading
// its info out of /proc, it just means nothing will be returned and
// the tracker will be unchanged.
func (t *Tracker) handleProc(proc Proc, updateTime time.Time) (*IDInfo, CollectErrors) {
var cerrs CollectErrors
procID, err := proc.GetProcID()
if err != nil {
return nil, cerrs
}
// Do nothing if we're ignoring this proc.
last, known := t.tracked[procID]
if known && last == nil {
return nil, cerrs
}
metrics, softerrors, err := proc.GetMetrics()
if err != nil {
if t.debug {
log.Printf("error reading metrics for %+v: %v", procID, err)
}
// This usually happens due to the proc having exited, i.e.
// we lost the race. We don't count that as an error.
if err != ErrProcNotExist {
cerrs.Read++
}
return nil, cerrs
}
var threads []Thread
threads, err = proc.GetThreads()
if err != nil {
softerrors |= 1
}
cerrs.Partial += softerrors
if len(threads) > 0 {
metrics.Counts.CtxSwitchNonvoluntary, metrics.Counts.CtxSwitchVoluntary = 0, 0
for _, thread := range threads {
metrics.Counts.CtxSwitchNonvoluntary += thread.Counts.CtxSwitchNonvoluntary
metrics.Counts.CtxSwitchVoluntary += thread.Counts.CtxSwitchVoluntary
metrics.States.Add(thread.States)
}
}
var newProc *IDInfo
if known {
last.update(metrics, updateTime, &cerrs, threads)
} else {
static, err := proc.GetStatic()
if err != nil {
if t.debug {
log.Printf("error reading static details for %+v: %v", procID, err)
}
return nil, cerrs
}
newProc = &IDInfo{procID, static, metrics, threads}
if t.debug {
log.Printf("found new proc: %s", newProc)
}
// Is this a new process with the same pid as one we already know?
// Then delete it from the known map, otherwise the cleanup in Update()
// will remove the ProcIds entry we're creating here.
if oldProcID, ok := t.procIds[procID.Pid]; ok {
delete(t.tracked, oldProcID)
}
t.procIds[procID.Pid] = procID
}
return newProc, cerrs
}
// update scans procs and updates metrics for those which are tracked. Processes
// that have gone away get removed from the Tracked map. New processes are
// returned, along with the count of nonfatal errors.
func (t *Tracker) update(procs Iter) ([]IDInfo, CollectErrors, error) {
var newProcs []IDInfo
var colErrs CollectErrors
var now = time.Now()
for procs.Next() {
procId, err := procs.GetProcId()
if err != nil {
continue
newProc, cerrs := t.handleProc(procs, now)
if newProc != nil {
newProcs = append(newProcs, *newProc)
}
last, known := t.Tracked[procId]
// Are we ignoring this proc?
if known && last == nil {
continue
}
// TODO if just the io file is unreadable, should we still return the other metrics?
metrics, err := procs.GetMetrics()
if err != nil {
if os.IsPermission(err) {
permissionErrors++
t.Ignore(procId)
}
continue
}
if known {
var newaccum, lastaccum Counts
dcpu := metrics.CpuTime - last.info.CpuTime
drbytes := metrics.ReadBytes - last.info.ReadBytes
dwbytes := metrics.WriteBytes - last.info.WriteBytes
lastaccum = Counts{Cpu: dcpu, ReadBytes: drbytes, WriteBytes: dwbytes}
newaccum = Counts{
Cpu: last.accum.Cpu + lastaccum.Cpu,
ReadBytes: last.accum.ReadBytes + lastaccum.ReadBytes,
WriteBytes: last.accum.WriteBytes + lastaccum.WriteBytes,
}
last.info.ProcMetrics = metrics
last.lastUpdate = now
last.accum = newaccum
last.lastaccum = lastaccum
} else {
static, err := procs.GetStatic()
if err != nil {
continue
}
newProcs = append(newProcs, ProcIdInfo{procId, static, metrics})
// Is this a new process with the same pid as one we already know?
if oldProcId, ok := t.ProcIds[procId.Pid]; ok {
// Delete it from known, otherwise the cleanup below will remove the
// ProcIds entry we're about to create
delete(t.Tracked, oldProcId)
}
t.ProcIds[procId.Pid] = procId
}
colErrs.Read += cerrs.Read
colErrs.Partial += cerrs.Partial
}
err := procs.Close()
if err != nil {
return nil, permissionErrors, fmt.Errorf("Error reading procs: %v", err)
return nil, colErrs, fmt.Errorf("Error reading procs: %v", err)
}
// Rather than allocating a new map each time to detect procs that have
// disappeared, we bump the last update time on those that are still
// present. Then as a second pass we traverse the map looking for
// stale procs and removing them.
for procId, pinfo := range t.Tracked {
for procID, pinfo := range t.tracked {
if pinfo == nil {
// TODO is this a bug? we're not tracking the proc so we don't see it go away so ProcIds
// and Tracked are leaking?
continue
}
if pinfo.lastUpdate != now {
delete(t.Tracked, procId)
delete(t.ProcIds, procId.Pid)
delete(t.tracked, procID)
delete(t.procIds, procID.Pid)
}
}
return newProcs, permissionErrors, nil
return newProcs, colErrs, nil
}
// checkAncestry walks the process tree recursively towards the root,
// stopping at pid 1 or upon finding a parent that's already tracked or
// ignored.  If a tracked parent is found, this proc is tracked under the
// parent's group and that group name is returned; otherwise the proc is
// ignored and "" is returned.
func (t *Tracker) checkAncestry(idinfo IDInfo, newprocs map[ID]IDInfo) string {
	parentID := t.procIds[idinfo.ParentPid]
	if parentID.Pid < 1 {
		// Reached root of process tree without finding a tracked parent.
		if t.debug {
			log.Printf("ignoring unmatched proc with no matched parent: %+v", idinfo)
		}
		t.ignore(idinfo.ID)
		return ""
	}

	// Is the parent already known to the tracker?
	if ptproc, ok := t.tracked[parentID]; ok {
		if ptproc == nil {
			// The parent is known but untracked (ignored); ignore this one too.
			t.ignore(idinfo.ID)
			return ""
		}
		if t.debug {
			log.Printf("matched as %q because child of %+v: %+v",
				ptproc.groupName, parentID, idinfo)
		}
		// We've found a tracked parent.
		t.track(ptproc.groupName, idinfo)
		return ptproc.groupName
	}

	// Is the parent another new process?
	if pinfo, ok := newprocs[parentID]; ok {
		if name := t.checkAncestry(pinfo, newprocs); name != "" {
			if t.debug {
				log.Printf("matched as %q because child of %+v: %+v",
					name, parentID, idinfo)
			}
			// A tracked ancestor exists, so this entire lineage is tracked.
			t.track(name, idinfo)
			return name
		}
	}

	// Parent is dead, i.e. we never saw it, or there's no tracked proc
	// in our ancestry.
	if t.debug {
		log.Printf("ignoring unmatched proc with no matched parent: %+v", idinfo)
	}
	t.ignore(idinfo.ID)
	return ""
}
// lookupUid resolves a numeric uid to a username, caching results in
// t.username.  When the lookup fails, the decimal uid string is used
// (and cached) as the name.
func (t *Tracker) lookupUid(uid int) string {
	if cached, ok := t.username[uid]; ok {
		return cached
	}

	uidstr := strconv.Itoa(uid)
	name := uidstr
	if u, err := user.LookupId(uidstr); err == nil {
		name = u.Username
	}
	t.username[uid] = name
	return name
}
// Update modifies the tracker's internal state based on what it reads from
// iter. Tracks any new procs the namer wants tracked, and updates
// its metrics for existing tracked procs. Returns nonfatal errors
// and the status of all tracked procs, or an error if fatal.
func (t *Tracker) Update(iter Iter) (CollectErrors, []Update, error) {
	newProcs, colErrs, err := t.update(iter)
	if err != nil {
		return colErrs, nil, err
	}

	// Step 1: track any new proc that should be tracked based on its
	// name and cmdline.
	untracked := make(map[ID]IDInfo)
	for _, idinfo := range newProcs {
		attrs := common.ProcAttributes{
			Name:     idinfo.Name,
			Cmdline:  idinfo.Cmdline,
			Username: t.lookupUid(idinfo.EffectiveUID),
		}
		wanted, gname := t.namer.MatchAndName(attrs)
		if !wanted {
			untracked[idinfo.ID] = idinfo
			continue
		}
		if t.debug {
			log.Printf("matched as %q: %+v", gname, idinfo)
		}
		t.track(gname, idinfo)
	}

	// Step 2: track any untracked new proc that should be tracked
	// because its parent is tracked.
	if t.trackChildren {
		for _, idinfo := range untracked {
			if _, ok := t.tracked[idinfo.ID]; ok {
				// Already tracked or ignored in an earlier iteration.
				continue
			}
			t.checkAncestry(idinfo, untracked)
		}
	}

	// Collect the current status of everything still tracked.
	updates := []Update{}
	for _, tproc := range t.tracked {
		if tproc != nil {
			updates = append(updates, tproc.getUpdate())
		}
	}
	return colErrs, updates, nil
}