Update go dependencies

This commit is contained in:
Manuel Alejandro de Brito Fontes 2018-12-05 13:27:09 -03:00
parent 432f534383
commit f4a4daed84
1299 changed files with 71186 additions and 91183 deletions

View file

@ -1,173 +1,179 @@
package proc
import (
common "github.com/ncabatoff/process-exporter"
"time"
seq "github.com/ncabatoff/go-seq/seq"
common "github.com/ncabatoff/process-exporter"
)
type (
// Grouper is the top-level interface to the process metrics. All tracked
// procs sharing the same group name are aggregated.
Grouper struct {
namer common.MatchNamer
trackChildren bool
// track how much was seen last time so we can report the delta
GroupStats map[string]Counts
tracker *Tracker
// groupAccum records the historical accumulation of a group so that
// we can avoid ever decreasing the counts we return.
groupAccum map[string]Counts
tracker *Tracker
threadAccum map[string]map[string]Threads
debug bool
}
GroupCountMap map[string]GroupCounts
// GroupByName maps group name to group metrics.
GroupByName map[string]Group
GroupCounts struct {
// Threads collects metrics for threads in a group sharing a thread name.
Threads struct {
Name string
NumThreads int
Counts
Procs int
Memresident uint64
Memvirtual uint64
}
// Group describes the metrics of a single group.
Group struct {
Counts
States
Wchans map[string]int
Procs int
Memory
OldestStartTime time.Time
OpenFDs uint64
WorstFDratio float64
NumThreads uint64
Threads []Threads
}
)
func NewGrouper(trackChildren bool, namer common.MatchNamer) *Grouper {
// lessThreads returns true if x sorts before y, using a deep structural
// comparison via seq.Compare. Test designers should ensure they always have
// a unique name/numthreads combination for each group.
func lessThreads(x, y Threads) bool { return seq.Compare(x, y) < 0 }
// NewGrouper creates a grouper.
// NewGrouper creates a grouper. namer decides which procs are tracked and
// what group they belong to; trackChildren, alwaysRecheck and debug are
// forwarded to the underlying Tracker.
func NewGrouper(namer common.MatchNamer, trackChildren, alwaysRecheck, debug bool) *Grouper {
	return &Grouper{
		namer:         namer,
		trackChildren: trackChildren,
		groupAccum:    make(map[string]Counts),
		threadAccum:   make(map[string]map[string]Threads),
		tracker:       NewTracker(namer, trackChildren, alwaysRecheck, debug),
		debug:         debug,
	}
}
// checkAncestry walks up the process tree from idinfo looking for a tracked
// ancestor. If one is found, idinfo (and any intermediate new procs visited
// during recursion) are tracked under that ancestor's group name, which is
// returned. Otherwise idinfo is ignored and "" is returned.
func (g *Grouper) checkAncestry(idinfo ProcIdInfo, newprocs map[ProcId]ProcIdInfo) string {
	parentID := g.tracker.ProcIds[idinfo.ParentPid]
	if parentID.Pid < 1 {
		// Walked off the top of the process tree: no tracked ancestor exists.
		g.tracker.Ignore(idinfo.ProcId)
		return ""
	}
	// Case 1: the parent is already known to the tracker.
	if ptproc, known := g.tracker.Tracked[parentID]; known {
		if ptproc == nil {
			// Parent was explicitly ignored; ignore the child as well.
			g.tracker.Ignore(idinfo.ProcId)
			return ""
		}
		// Parent is tracked: the child joins its group.
		g.tracker.Track(ptproc.GroupName, idinfo)
		return ptproc.GroupName
	}
	// Case 2: the parent is itself a new process; recurse to settle its fate.
	if pinfo, isNew := newprocs[parentID]; isNew {
		if gname := g.checkAncestry(pinfo, newprocs); gname != "" {
			// A tracked ancestor was found, so this whole lineage is tracked.
			g.tracker.Track(gname, idinfo)
			return gname
		}
	}
	// Parent is dead (never seen) or the lineage contains no tracked proc.
	g.tracker.Ignore(idinfo.ProcId)
	return ""
}
// Update tracks any new procs that should be according to policy, and updates
// the metrics for already tracked procs. Permission errors are returned as a
// count, and will not affect the error return value.
// Update tracks any new procs that should be according to policy, and updates
// the metrics for already tracked procs. Permission errors are returned as a
// count, and will not affect the error return value.
func (g *Grouper) Update(iter ProcIter) (int, error) {
	newProcs, permErrs, err := g.tracker.Update(iter)
	if err != nil {
		return permErrs, err
	}

	// Pass 1: start tracking new procs whose name/cmdline the namer wants;
	// remember the rest in case a tracked parent makes them interesting.
	unmatched := make(map[ProcId]ProcIdInfo)
	for _, info := range newProcs {
		nacl := common.NameAndCmdline{Name: info.Name, Cmdline: info.Cmdline}
		if wanted, gname := g.namer.MatchAndName(nacl); wanted {
			g.tracker.Track(gname, info)
		} else {
			unmatched[info.ProcId] = info
		}
	}

	// Pass 2 (only if enabled): adopt unmatched procs with tracked ancestry.
	if g.trackChildren {
		for _, info := range unmatched {
			if _, seen := g.tracker.Tracked[info.ProcId]; seen {
				// A sibling's recursion already decided this proc's fate.
				continue
			}
			g.checkAncestry(info, unmatched)
		}
	}
	return permErrs, nil
}
// groups returns the aggregate metrics for all groups tracked. This reflects
// solely what's currently running.
func (g *Grouper) groups() GroupCountMap {
gcounts := make(GroupCountMap)
func groupadd(grp Group, ts Update) Group {
var zeroTime time.Time
for _, tinfo := range g.tracker.Tracked {
if tinfo == nil {
continue
}
cur := gcounts[tinfo.GroupName]
cur.Procs++
tstats := tinfo.GetStats()
cur.Memresident += tstats.Memory.Resident
cur.Memvirtual += tstats.Memory.Virtual
cur.OpenFDs += tstats.Filedesc.Open
openratio := float64(tstats.Filedesc.Open) / float64(tstats.Filedesc.Limit)
if cur.WorstFDratio < openratio {
cur.WorstFDratio = openratio
}
cur.Counts.Cpu += tstats.latest.Cpu
cur.Counts.ReadBytes += tstats.latest.ReadBytes
cur.Counts.WriteBytes += tstats.latest.WriteBytes
if cur.OldestStartTime == zeroTime || tstats.start.Before(cur.OldestStartTime) {
cur.OldestStartTime = tstats.start
}
gcounts[tinfo.GroupName] = cur
grp.Procs++
grp.Memory.ResidentBytes += ts.Memory.ResidentBytes
grp.Memory.VirtualBytes += ts.Memory.VirtualBytes
grp.Memory.VmSwapBytes += ts.Memory.VmSwapBytes
if ts.Filedesc.Open != -1 {
grp.OpenFDs += uint64(ts.Filedesc.Open)
}
openratio := float64(ts.Filedesc.Open) / float64(ts.Filedesc.Limit)
if grp.WorstFDratio < openratio {
grp.WorstFDratio = openratio
}
grp.NumThreads += ts.NumThreads
grp.Counts.Add(ts.Latest)
grp.States.Add(ts.States)
if grp.OldestStartTime == zeroTime || ts.Start.Before(grp.OldestStartTime) {
grp.OldestStartTime = ts.Start
}
return gcounts
if grp.Wchans == nil {
grp.Wchans = make(map[string]int)
}
for wchan, count := range ts.Wchans {
grp.Wchans[wchan] += count
}
return grp
}
// Groups returns GroupCounts with Counts that never decrease in value from one
// call to the next. Even if processes exit, their CPU and IO contributions up
// to that point are included in the results. Even if no processes remain
// in a group it will still be included in the results.
func (g *Grouper) Groups() GroupCountMap {
groups := g.groups()
// Update asks the tracker to report on each tracked process by name.
// These are aggregated by groupname, augmented by accumulated counts
// from the past, and returned. Note that while the Tracker reports
// only what counts have changed since last cycle, Grouper.Update
// returns counts that never decrease. Even once the last process
// with name X disappears, name X will still appear in the results
// with the same counts as before; of course, all non-count metrics
// will be zero.
func (g *Grouper) Update(iter Iter) (CollectErrors, GroupByName, error) {
	cerrs, tracked, err := g.tracker.Update(iter)
	if err != nil {
		// Hard failure reading procs; return whatever collect errors we have.
		return cerrs, nil, err
	}
	// Merge this cycle's per-proc updates into per-group aggregates.
	return cerrs, g.groups(tracked), nil
}
// First add any accumulated counts to what was just observed,
// Translate the updates into a new GroupByName and update internal history.
func (g *Grouper) groups(tracked []Update) GroupByName {
groups := make(GroupByName)
threadsByGroup := make(map[string][]ThreadUpdate)
for _, update := range tracked {
groups[update.GroupName] = groupadd(groups[update.GroupName], update)
if update.Threads != nil {
threadsByGroup[update.GroupName] =
append(threadsByGroup[update.GroupName], update.Threads...)
}
}
// Add any accumulated counts to what was just observed,
// and update the accumulators.
for gname, group := range groups {
if oldcounts, ok := g.GroupStats[gname]; ok {
group.Counts.Cpu += oldcounts.Cpu
group.Counts.ReadBytes += oldcounts.ReadBytes
group.Counts.WriteBytes += oldcounts.WriteBytes
if oldcounts, ok := g.groupAccum[gname]; ok {
group.Counts.Add(Delta(oldcounts))
}
g.GroupStats[gname] = group.Counts
g.groupAccum[gname] = group.Counts
group.Threads = g.threads(gname, threadsByGroup[gname])
groups[gname] = group
}
// Now add any groups that were observed in the past but aren't running now.
for gname, gcounts := range g.GroupStats {
for gname, gcounts := range g.groupAccum {
if _, ok := groups[gname]; !ok {
groups[gname] = GroupCounts{Counts: gcounts}
groups[gname] = Group{Counts: gcounts}
}
}
return groups
}
// threads folds the per-thread updates for group gname into per-thread-name
// aggregates, adds in counts accumulated in prior cycles, refreshes the
// accumulator, and returns the aggregates. Returns nil (and drops any stored
// history) when no thread updates were reported.
func (g *Grouper) threads(gname string, tracked []ThreadUpdate) []Threads {
	if len(tracked) == 0 {
		delete(g.threadAccum, gname)
		return nil
	}

	// Aggregate the raw updates by thread name.
	byName := make(map[string]Threads)
	for _, tu := range tracked {
		agg := byName[tu.ThreadName]
		agg.Name = tu.ThreadName
		agg.NumThreads++
		agg.Counts.Add(tu.Latest)
		byName[tu.ThreadName] = agg
	}

	// Fold in historical counts for thread names we've seen before,
	// so reported counters never decrease.
	if history := g.threadAccum[gname]; history != nil {
		for name, agg := range byName {
			if prev, ok := history[name]; ok {
				agg.Add(Delta(prev.Counts))
				byName[name] = agg
			}
		}
	}
	g.threadAccum[gname] = byName

	out := make([]Threads, 0, len(byName))
	for _, agg := range byName {
		out = append(out, agg)
	}
	return out
}

View file

@ -2,18 +2,21 @@ package proc
import (
"fmt"
"os"
"path/filepath"
"strconv"
"time"
"github.com/prometheus/procfs"
"github.com/ncabatoff/procfs"
)
func newProcIdStatic(pid, ppid int, startTime uint64, name string, cmdline []string) ProcIdStatic {
return ProcIdStatic{ProcId{pid, startTime}, ProcStatic{name, cmdline, ppid, time.Time{}}}
}
// ErrProcNotExist indicates a process couldn't be read because it doesn't exist,
// typically because it disappeared while we were reading it.
var ErrProcNotExist = fmt.Errorf("process does not exist")
type (
// ProcId uniquely identifies a process.
ProcId struct {
// ID uniquely identifies a process.
ID struct {
// UNIX process id
Pid int
// The time the process started after system boot, the value is expressed
@ -21,82 +24,138 @@ type (
StartTimeRel uint64
}
// ProcStatic contains data read from /proc/pid/*
ProcStatic struct {
Name string
Cmdline []string
ParentPid int
StartTime time.Time
ThreadID ID
// Static contains data read from /proc/pid/*
Static struct {
Name string
Cmdline []string
ParentPid int
StartTime time.Time
EffectiveUID int
}
// ProcMetrics contains data read from /proc/pid/*
ProcMetrics struct {
CpuTime float64
ReadBytes uint64
WriteBytes uint64
// Counts are metric counters common to threads and processes and groups.
Counts struct {
CPUUserTime float64
CPUSystemTime float64
ReadBytes uint64
WriteBytes uint64
MajorPageFaults uint64
MinorPageFaults uint64
CtxSwitchVoluntary uint64
CtxSwitchNonvoluntary uint64
}
// Memory describes a proc's memory usage.
Memory struct {
ResidentBytes uint64
VirtualBytes uint64
OpenFDs uint64
MaxFDs uint64
VmSwapBytes uint64
}
ProcIdStatic struct {
ProcId
ProcStatic
// Filedesc describes a proc's file descriptor usage and soft limit.
Filedesc struct {
// Open is the count of open file descriptors, -1 if unknown.
Open int64
// Limit is the fd soft limit for the process.
Limit uint64
}
ProcInfo struct {
ProcStatic
ProcMetrics
// States counts how many threads are in each state.
States struct {
Running int
Sleeping int
Waiting int
Zombie int
Other int
}
ProcIdInfo struct {
ProcId
ProcStatic
ProcMetrics
// Metrics contains data read from /proc/pid/*
Metrics struct {
Counts
Memory
Filedesc
NumThreads uint64
States
Wchan string
}
// Thread contains per-thread data.
Thread struct {
ThreadID
ThreadName string
Counts
Wchan string
States
}
// IDInfo groups all info for a single process.
IDInfo struct {
ID
Static
Metrics
Threads []Thread
}
// ProcIdInfoThreads struct {
// ProcIdInfo
// Threads []ProcThread
// }
// Proc wraps the details of the underlying procfs-reading library.
// Any of these methods may fail if the process has disapeared.
// We try to return as much as possible rather than an error, e.g.
// if some /proc files are unreadable.
Proc interface {
// GetPid() returns the POSIX PID (process id). They may be reused over time.
GetPid() int
// GetProcId() returns (pid,starttime), which can be considered a unique process id.
// It may fail if the caller doesn't have permission to read /proc/<pid>/stat, or if
// the process has disapeared.
GetProcId() (ProcId, error)
// GetProcID() returns (pid,starttime), which can be considered a unique process id.
GetProcID() (ID, error)
// GetStatic() returns various details read from files under /proc/<pid>/. Technically
// name may not be static, but we'll pretend it is.
// It may fail if the caller doesn't have permission to read those files, or if
// the process has disapeared.
GetStatic() (ProcStatic, error)
GetStatic() (Static, error)
// GetMetrics() returns various metrics read from files under /proc/<pid>/.
// It may fail if the caller doesn't have permission to read those files, or if
// the process has disapeared.
GetMetrics() (ProcMetrics, error)
// It returns an error on complete failure. Otherwise, it returns metrics
// and 0 on complete success, 1 if some (like I/O) couldn't be read.
GetMetrics() (Metrics, int, error)
GetStates() (States, error)
GetWchan() (string, error)
GetCounts() (Counts, int, error)
GetThreads() ([]Thread, error)
}
// proc is a wrapper for procfs.Proc that caches results of some reads and implements Proc.
proc struct {
// proccache implements the Proc interface by acting as wrapper for procfs.Proc
// that caches results of some reads.
proccache struct {
procfs.Proc
procid *ProcId
stat *procfs.ProcStat
cmdline []string
io *procfs.ProcIO
bootTime uint64
procid *ID
stat *procfs.ProcStat
status *procfs.ProcStatus
cmdline []string
io *procfs.ProcIO
fs *FS
wchan *string
}
proc struct {
proccache
}
// procs is a fancier []Proc that saves on some copying.
procs interface {
get(int) Proc
length() int
}
// procfsprocs implements procs using procfs.
procfsprocs struct {
Procs []procfs.Proc
bootTime uint64
Procs []procfs.Proc
fs *FS
}
// ProcIter is an iterator over a sequence of procs.
ProcIter interface {
// Iter is an iterator over a sequence of procs.
Iter interface {
// Next returns true if the iterator is not exhausted.
Next() bool
// Close releases any resources the iterator uses.
@ -105,7 +164,7 @@ type (
Proc
}
// procIterator implements the ProcIter interface using procfs.
// procIterator implements the Iter interface
procIterator struct {
// procs is the list of Proc we're iterating over.
procs
@ -119,66 +178,101 @@ type (
Proc
}
procIdInfos []ProcIdInfo
// Source is a source of procs.
Source interface {
// AllProcs returns all the processes in this source at this moment in time.
AllProcs() Iter
}
// FS implements Source.
FS struct {
procfs.FS
BootTime uint64
MountPoint string
debug bool
}
)
func procInfoIter(ps ...ProcIdInfo) ProcIter {
return &procIterator{procs: procIdInfos(ps), idx: -1}
// String returns a human-readable "<id>:<static>" rendering of the proc,
// useful for debug logging.
func (ii IDInfo) String() string {
	return fmt.Sprintf("%+v:%+v", ii.ID, ii.Static)
}
func Info(p Proc) (ProcIdInfo, error) {
id, err := p.GetProcId()
if err != nil {
return ProcIdInfo{}, err
}
static, err := p.GetStatic()
if err != nil {
return ProcIdInfo{}, err
}
metrics, err := p.GetMetrics()
if err != nil {
return ProcIdInfo{}, err
}
return ProcIdInfo{id, static, metrics}, nil
// Add folds the delta c2 into the running counter totals.
func (c *Counts) Add(c2 Delta) {
	c.CPUUserTime += c2.CPUUserTime
	c.CPUSystemTime += c2.CPUSystemTime
	c.MajorPageFaults += c2.MajorPageFaults
	c.MinorPageFaults += c2.MinorPageFaults
	c.ReadBytes += c2.ReadBytes
	c.WriteBytes += c2.WriteBytes
	c.CtxSwitchVoluntary += c2.CtxSwitchVoluntary
	c.CtxSwitchNonvoluntary += c2.CtxSwitchNonvoluntary
}
func (p procIdInfos) get(i int) Proc {
return &p[i]
// Sub returns the field-wise difference c - c2 as a Delta. The receiver is
// a copy, so neither operand is modified.
func (c Counts) Sub(c2 Counts) Delta {
	return Delta(Counts{
		CPUUserTime:           c.CPUUserTime - c2.CPUUserTime,
		CPUSystemTime:         c.CPUSystemTime - c2.CPUSystemTime,
		ReadBytes:             c.ReadBytes - c2.ReadBytes,
		WriteBytes:            c.WriteBytes - c2.WriteBytes,
		MajorPageFaults:       c.MajorPageFaults - c2.MajorPageFaults,
		MinorPageFaults:       c.MinorPageFaults - c2.MinorPageFaults,
		CtxSwitchVoluntary:    c.CtxSwitchVoluntary - c2.CtxSwitchVoluntary,
		CtxSwitchNonvoluntary: c.CtxSwitchNonvoluntary - c2.CtxSwitchNonvoluntary,
	})
}
func (p procIdInfos) length() int {
return len(p)
// Add accumulates the per-state thread tallies from s2 into s.
func (s *States) Add(s2 States) {
	s.Running += s2.Running
	s.Sleeping += s2.Sleeping
	s.Waiting += s2.Waiting
	s.Zombie += s2.Zombie
	s.Other += s2.Other
}
func (p ProcIdInfo) GetPid() int {
return p.ProcId.Pid
// GetThreads implements Proc, returning the stored per-thread snapshots.
func (p IDInfo) GetThreads() ([]Thread, error) {
	return p.Threads, nil
}
func (p ProcIdInfo) GetProcId() (ProcId, error) {
return p.ProcId, nil
// GetPid implements Proc, returning the stored POSIX process id.
func (p IDInfo) GetPid() int {
	return p.ID.Pid
}
func (p ProcIdInfo) GetStatic() (ProcStatic, error) {
return p.ProcStatic, nil
// GetProcID implements Proc, returning the stored unique (pid, starttime) id.
func (p IDInfo) GetProcID() (ID, error) {
	return p.ID, nil
}
func (p ProcIdInfo) GetMetrics() (ProcMetrics, error) {
return p.ProcMetrics, nil
// GetStatic implements Proc, returning the stored static process info.
func (p IDInfo) GetStatic() (Static, error) {
	return p.Static, nil
}
func (p procfsprocs) get(i int) Proc {
return &proc{Proc: p.Procs[i], bootTime: p.bootTime}
// GetCounts implements Proc, returning the stored counters and zero
// soft errors.
func (p IDInfo) GetCounts() (Counts, int, error) {
	return p.Metrics.Counts, 0, nil
}
func (p procfsprocs) length() int {
return len(p.Procs)
// GetMetrics implements Proc, returning the stored metrics and zero
// soft errors.
func (p IDInfo) GetMetrics() (Metrics, int, error) {
	return p.Metrics, 0, nil
}
func (p *proc) GetPid() int {
// GetStates implements Proc.
// GetStates implements Proc, returning the stored run-state tallies.
func (p IDInfo) GetStates() (States, error) {
	return p.States, nil
}
// GetWchan implements Proc, returning the stored wait-channel name.
func (p IDInfo) GetWchan() (string, error) {
	return p.Wchan, nil
}
// GetPid implements Proc, returning the kernel PID of the wrapped procfs proc.
func (p *proccache) GetPid() int {
	return p.Proc.PID
}
func (p *proc) GetStat() (procfs.ProcStat, error) {
func (p *proccache) getStat() (procfs.ProcStat, error) {
if p.stat == nil {
stat, err := p.Proc.NewStat()
if err != nil {
@ -190,19 +284,32 @@ func (p *proc) GetStat() (procfs.ProcStat, error) {
return *p.stat, nil
}
func (p *proc) GetProcId() (ProcId, error) {
if p.procid == nil {
stat, err := p.GetStat()
func (p *proccache) getStatus() (procfs.ProcStatus, error) {
if p.status == nil {
status, err := p.Proc.NewStatus()
if err != nil {
return ProcId{}, err
return procfs.ProcStatus{}, err
}
p.procid = &ProcId{Pid: p.GetPid(), StartTimeRel: stat.Starttime}
p.status = &status
}
return *p.status, nil
}
// GetProcID implements Proc. The (pid, relative start time) pair is computed
// once from /proc/<pid>/stat and cached for subsequent calls.
func (p *proccache) GetProcID() (ID, error) {
	if p.procid != nil {
		return *p.procid, nil
	}
	stat, err := p.getStat()
	if err != nil {
		return ID{}, err
	}
	id := ID{Pid: p.GetPid(), StartTimeRel: stat.Starttime}
	p.procid = &id
	return id, nil
}
func (p *proc) GetCmdLine() ([]string, error) {
func (p *proccache) getCmdLine() ([]string, error) {
if p.cmdline == nil {
cmdline, err := p.Proc.CmdLine()
if err != nil {
@ -213,7 +320,18 @@ func (p *proc) GetCmdLine() ([]string, error) {
return p.cmdline, nil
}
func (p *proc) GetIo() (procfs.ProcIO, error) {
// getWchan reads the wait-channel name for this proc, caching the result
// so repeated calls don't hit /proc again.
func (p *proccache) getWchan() (string, error) {
	if p.wchan != nil {
		return *p.wchan, nil
	}
	wchan, err := p.Proc.Wchan()
	if err != nil {
		return "", err
	}
	p.wchan = &wchan
	return wchan, nil
}
func (p *proccache) getIo() (procfs.ProcIO, error) {
if p.io == nil {
io, err := p.Proc.NewIO()
if err != nil {
@ -224,56 +342,199 @@ func (p *proc) GetIo() (procfs.ProcIO, error) {
return *p.io, nil
}
func (p proc) GetStatic() (ProcStatic, error) {
cmdline, err := p.GetCmdLine()
// GetStatic returns the ProcStatic corresponding to this proc.
func (p *proccache) GetStatic() (Static, error) {
// /proc/<pid>/cmdline is normally world-readable.
cmdline, err := p.getCmdLine()
if err != nil {
return ProcStatic{}, err
return Static{}, err
}
stat, err := p.GetStat()
// /proc/<pid>/stat is normally world-readable.
stat, err := p.getStat()
if err != nil {
return ProcStatic{}, err
return Static{}, err
}
startTime := time.Unix(int64(p.bootTime), 0)
startTime := time.Unix(int64(p.fs.BootTime), 0).UTC()
startTime = startTime.Add(time.Second / userHZ * time.Duration(stat.Starttime))
return ProcStatic{
Name: stat.Comm,
Cmdline: cmdline,
ParentPid: stat.PPID,
StartTime: startTime,
// /proc/<pid>/status is normally world-readable.
status, err := p.getStatus()
if err != nil {
return Static{}, err
}
return Static{
Name: stat.Comm,
Cmdline: cmdline,
ParentPid: stat.PPID,
StartTime: startTime,
EffectiveUID: status.UIDEffective,
}, nil
}
func (p proc) GetMetrics() (ProcMetrics, error) {
io, err := p.GetIo()
func (p proc) GetCounts() (Counts, int, error) {
stat, err := p.getStat()
if err != nil {
return ProcMetrics{}, err
if err == os.ErrNotExist {
err = ErrProcNotExist
}
return Counts{}, 0, err
}
stat, err := p.GetStat()
status, err := p.getStatus()
if err != nil {
return ProcMetrics{}, err
if err == os.ErrNotExist {
err = ErrProcNotExist
}
return Counts{}, 0, err
}
io, err := p.getIo()
softerrors := 0
if err != nil {
softerrors++
}
return Counts{
CPUUserTime: float64(stat.UTime) / userHZ,
CPUSystemTime: float64(stat.STime) / userHZ,
ReadBytes: io.ReadBytes,
WriteBytes: io.WriteBytes,
MajorPageFaults: uint64(stat.MajFlt),
MinorPageFaults: uint64(stat.MinFlt),
CtxSwitchVoluntary: uint64(status.VoluntaryCtxtSwitches),
CtxSwitchNonvoluntary: uint64(status.NonvoluntaryCtxtSwitches),
}, softerrors, nil
}
// GetWchan implements Proc, delegating to the cached wchan read.
func (p proc) GetWchan() (string, error) {
	return p.getWchan()
}
// GetStates implements Proc, classifying this proc's run state (read from
// /proc/<pid>/stat) into a States tally with exactly one nonzero bucket.
func (p proc) GetStates() (States, error) {
	stat, err := p.getStat()
	if err != nil {
		return States{}, err
	}
	states := States{}
	switch stat.State {
	case "R":
		states.Running = 1
	case "S":
		states.Sleeping = 1
	case "D":
		states.Waiting = 1
	case "Z":
		states.Zombie = 1
	default:
		states.Other = 1
	}
	return states, nil
}
// GetMetrics returns the current metrics for the proc. The results are
// not cached.
func (p proc) GetMetrics() (Metrics, int, error) {
counts, softerrors, err := p.GetCounts()
if err != nil {
return Metrics{}, 0, err
}
// We don't need to check for error here because p will have cached
// the successful result of calling getStat in GetCounts.
// Since GetMetrics isn't a pointer receiver method, our callers
// won't see the effect of the caching between calls.
stat, _ := p.getStat()
// Ditto for states
states, _ := p.GetStates()
status, err := p.getStatus()
if err != nil {
return Metrics{}, 0, err
}
numfds, err := p.Proc.FileDescriptorsLen()
if err != nil {
return ProcMetrics{}, err
numfds = -1
softerrors |= 1
}
limits, err := p.NewLimits()
limits, err := p.Proc.NewLimits()
if err != nil {
return ProcMetrics{}, err
return Metrics{}, 0, err
}
return ProcMetrics{
CpuTime: stat.CPUTime(),
ReadBytes: io.ReadBytes,
WriteBytes: io.WriteBytes,
ResidentBytes: uint64(stat.ResidentMemory()),
VirtualBytes: uint64(stat.VirtualMemory()),
OpenFDs: uint64(numfds),
MaxFDs: uint64(limits.OpenFiles),
}, nil
wchan, err := p.getWchan()
if err != nil {
softerrors |= 1
}
return Metrics{
Counts: counts,
Memory: Memory{
ResidentBytes: uint64(stat.ResidentMemory()),
VirtualBytes: uint64(stat.VirtualMemory()),
VmSwapBytes: uint64(status.VmSwapKB * 1024),
},
Filedesc: Filedesc{
Open: int64(numfds),
Limit: uint64(limits.OpenFiles),
},
NumThreads: uint64(stat.NumThreads),
States: states,
Wchan: wchan,
}, softerrors, nil
}
type FS struct {
procfs.FS
BootTime uint64
// GetThreads implements Proc. It enumerates the entries under
// /proc/<pid>/task and returns one Thread per task, or nil when fewer than
// two tasks were readable (i.e. the process is effectively single-threaded).
// Tasks that vanish or fail mid-read are silently skipped.
func (p proc) GetThreads() ([]Thread, error) {
	taskfs, err := p.fs.threadFs(p.PID)
	if err != nil {
		return nil, err
	}

	threads := []Thread{}
	it := taskfs.AllProcs()
	for it.Next() {
		id, iderr := it.GetProcID()
		if iderr != nil {
			continue
		}
		static, serr := it.GetStatic()
		if serr != nil {
			continue
		}
		counts, _, cerr := it.GetCounts()
		if cerr != nil {
			continue
		}
		// wchan and states are best-effort; their zero values are acceptable.
		wchan, _ := it.GetWchan()
		states, _ := it.GetStates()
		threads = append(threads, Thread{
			ThreadID:   ThreadID(id),
			ThreadName: static.Name,
			Counts:     counts,
			Wchan:      wchan,
			States:     states,
		})
	}
	if cerr := it.Close(); cerr != nil {
		return nil, cerr
	}
	if len(threads) < 2 {
		return nil, nil
	}
	return threads, nil
}
// See https://github.com/prometheus/procfs/blob/master/proc_stat.go for details on userHZ.
@ -281,7 +542,7 @@ const userHZ = 100
// NewFS returns a new FS mounted under the given mountPoint. It will error
// if the mount point can't be read.
func NewFS(mountPoint string) (*FS, error) {
func NewFS(mountPoint string, debug bool) (*FS, error) {
fs, err := procfs.NewFS(mountPoint)
if err != nil {
return nil, err
@ -290,17 +551,38 @@ func NewFS(mountPoint string) (*FS, error) {
if err != nil {
return nil, err
}
return &FS{fs, stat.BootTime}, nil
return &FS{fs, stat.BootTime, mountPoint, debug}, nil
}
func (fs *FS) AllProcs() ProcIter {
// threadFs returns an FS rooted at /proc/<pid>/task, letting the threads of
// pid be enumerated as if they were processes. Debug is always disabled on
// the derived FS.
func (fs *FS) threadFs(pid int) (*FS, error) {
	taskDir := filepath.Join(fs.MountPoint, strconv.Itoa(pid), "task")
	inner, err := procfs.NewFS(taskDir)
	if err != nil {
		return nil, err
	}
	return &FS{inner, fs.BootTime, taskDir, false}, nil
}
// AllProcs implements Source.
func (fs *FS) AllProcs() Iter {
procs, err := fs.FS.AllProcs()
if err != nil {
err = fmt.Errorf("Error reading procs: %v", err)
}
return &procIterator{procs: procfsprocs{procs, fs.BootTime}, err: err, idx: -1}
return &procIterator{procs: procfsprocs{procs, fs}, err: err, idx: -1}
}
// get implements procs, wrapping the i'th procfs.Proc in a caching proc.
func (p procfsprocs) get(i int) Proc {
	cache := proccache{Proc: p.Procs[i], fs: p.fs}
	return &proc{cache}
}
// length implements procs, reporting how many procs are in the snapshot.
func (p procfsprocs) length() int {
	return len(p.Procs)
}
// Next implements Iter.
func (pi *procIterator) Next() bool {
pi.idx++
if pi.idx < pi.procs.length() {
@ -311,6 +593,7 @@ func (pi *procIterator) Next() bool {
return pi.idx < pi.procs.length()
}
// Close implements Iter.
func (pi *procIterator) Close() error {
pi.Next()
pi.procs = nil

View file

@ -2,179 +2,432 @@ package proc
import (
"fmt"
"os"
"log"
"os/user"
"strconv"
"time"
seq "github.com/ncabatoff/go-seq/seq"
common "github.com/ncabatoff/process-exporter"
)
type (
Counts struct {
Cpu float64
ReadBytes uint64
WriteBytes uint64
}
Memory struct {
Resident uint64
Virtual uint64
}
Filedesc struct {
Open uint64
Limit uint64
}
// Tracker tracks processes and records metrics.
Tracker struct {
// Tracked holds the processes are being monitored. Processes
// namer determines what processes to track and names them
namer common.MatchNamer
// tracked holds the processes are being monitored. Processes
// may be blacklisted such that they no longer get tracked by
// setting their value in the Tracked map to nil.
Tracked map[ProcId]*TrackedProc
// ProcIds is a map from pid to ProcId. This is a convenience
// setting their value in the tracked map to nil.
tracked map[ID]*trackedProc
// procIds is a map from pid to ProcId. This is a convenience
// to allow finding the Tracked entry of a parent process.
ProcIds map[int]ProcId
procIds map[int]ID
// trackChildren makes Tracker track descendants of procs the
// namer wanted tracked.
trackChildren bool
// never ignore processes, i.e. always re-check untracked processes in case comm has changed
alwaysRecheck bool
username map[int]string
debug bool
}
// TrackedProc accumulates metrics for a process, as well as
// Delta is an alias of Counts used to signal that its contents are not
// totals, but rather the result of subtracting two totals.
Delta Counts
trackedThread struct {
name string
accum Counts
latest Delta
lastUpdate time.Time
wchan string
}
// trackedProc accumulates metrics for a process, as well as
// remembering an optional GroupName tag associated with it.
TrackedProc struct {
trackedProc struct {
// lastUpdate is used internally during the update cycle to find which procs have exited
lastUpdate time.Time
// info is the most recently obtained info for this proc
info ProcInfo
// accum is the total CPU and IO accrued since we started tracking this proc
accum Counts
// lastaccum is the CPU and IO accrued in the last Update()
lastaccum Counts
// GroupName is an optional tag for this proc.
GroupName string
// static
static Static
metrics Metrics
// lastaccum is the increment to the counters seen in the last update.
lastaccum Delta
// groupName is the tag for this proc given by the namer.
groupName string
threads map[ThreadID]trackedThread
}
trackedStats struct {
aggregate, latest Counts
// ThreadUpdate describes what's changed for a thread since the last cycle.
ThreadUpdate struct {
// ThreadName is the name of the thread based on field of stat.
ThreadName string
// Latest is how much the counts increased since last cycle.
Latest Delta
}
// Update reports on the latest stats for a process.
Update struct {
// GroupName is the name given by the namer to the process.
GroupName string
// Latest is how much the counts increased since last cycle.
Latest Delta
// Memory is the current memory usage.
Memory
// Filedesc is the current fd usage/limit.
Filedesc
start time.Time
// Start is the time the process started.
Start time.Time
// NumThreads is the number of threads.
NumThreads uint64
// States is how many processes are in which run state.
States
// Wchans is how many threads are in each non-zero wchan.
Wchans map[string]int
// Threads are the thread updates for this process.
Threads []ThreadUpdate
}
// CollectErrors describes non-fatal errors found while collecting proc
// metrics.
CollectErrors struct {
// Read is incremented every time GetMetrics() returns an error.
// This means we failed to load even the basics for the process,
// and not just because it disappeared on us.
Read int
// Partial is incremented every time we're unable to collect
// some metrics (e.g. I/O) for a tracked proc, but we're still able
// to get the basic stuff like cmdline and core stats.
Partial int
}
)
func (tp *TrackedProc) GetName() string {
return tp.info.Name
// lessUpdateGroupName orders Updates lexically by their group name.
func lessUpdateGroupName(x, y Update) bool { return x.GroupName < y.GroupName }

// lessThreadUpdate orders ThreadUpdates by deep structural comparison.
func lessThreadUpdate(x, y ThreadUpdate) bool { return seq.Compare(x, y) < 0 }

// lessCounts orders Counts by deep structural comparison.
func lessCounts(x, y Counts) bool { return seq.Compare(x, y) < 0 }
// getUpdate snapshots this proc's latest deltas and gauges as an Update.
// Wchans always records the process wchan (if any); when the proc is
// multi-threaded, per-thread deltas and wchans are folded in as well.
func (tp *trackedProc) getUpdate() Update {
	wchans := make(map[string]int)
	if w := tp.metrics.Wchan; w != "" {
		wchans[w] = 1
	}

	update := Update{
		GroupName:  tp.groupName,
		Latest:     tp.lastaccum,
		Memory:     tp.metrics.Memory,
		Filedesc:   tp.metrics.Filedesc,
		Start:      tp.static.StartTime,
		NumThreads: tp.metrics.NumThreads,
		States:     tp.metrics.States,
		Wchans:     wchans,
	}

	if len(tp.threads) > 1 {
		for _, tt := range tp.threads {
			update.Threads = append(update.Threads, ThreadUpdate{tt.name, tt.latest})
			if tt.wchan != "" {
				update.Wchans[tt.wchan]++
			}
		}
	}
	return update
}
func (tp *TrackedProc) GetCmdLine() []string {
return tp.info.Cmdline
}
func (tp *TrackedProc) GetStats() trackedStats {
mem := Memory{Resident: tp.info.ResidentBytes, Virtual: tp.info.VirtualBytes}
fd := Filedesc{Open: tp.info.OpenFDs, Limit: tp.info.MaxFDs}
return trackedStats{
aggregate: tp.accum,
latest: tp.lastaccum,
Memory: mem,
Filedesc: fd,
start: tp.info.StartTime,
// NewTracker creates a Tracker. namer decides which processes are tracked
// and names their group; trackChildren extends tracking to descendants of
// matched procs; alwaysRecheck re-tests previously rejected procs each cycle
// (in case their comm has changed); debug enables verbose logging.
func NewTracker(namer common.MatchNamer, trackChildren, alwaysRecheck, debug bool) *Tracker {
	return &Tracker{
		namer:         namer,
		tracked:       make(map[ID]*trackedProc),
		procIds:       make(map[int]ID),
		trackChildren: trackChildren,
		alwaysRecheck: alwaysRecheck,
		username:      make(map[int]string),
		debug:         debug,
	}
}
func NewTracker() *Tracker {
return &Tracker{Tracked: make(map[ProcId]*TrackedProc), ProcIds: make(map[int]ProcId)}
// track starts monitoring the process described by idinfo under the given
// group name, seeding per-thread state when thread data is present.
func (t *Tracker) track(groupName string, idinfo IDInfo) {
	var threadMap map[ThreadID]trackedThread
	if len(idinfo.Threads) > 0 {
		threadMap = make(map[ThreadID]trackedThread, len(idinfo.Threads))
		for _, thr := range idinfo.Threads {
			// A newly seen thread starts with a zero delta and zero
			// last-update time; its accumulator is seeded from the snapshot.
			threadMap[thr.ThreadID] = trackedThread{
				name:       thr.ThreadName,
				accum:      thr.Counts,
				latest:     Delta{},
				lastUpdate: time.Time{},
				wchan:      thr.Wchan,
			}
		}
	}
	t.tracked[idinfo.ID] = &trackedProc{
		groupName: groupName,
		static:    idinfo.Static,
		metrics:   idinfo.Metrics,
		threads:   threadMap,
	}
}
func (t *Tracker) Track(groupName string, idinfo ProcIdInfo) {
info := ProcInfo{idinfo.ProcStatic, idinfo.ProcMetrics}
t.Tracked[idinfo.ProcId] = &TrackedProc{GroupName: groupName, info: info}
// ignore marks id as permanently uninteresting by storing a nil tracked
// entry — unless alwaysRecheck is set, in which case the proc is left
// unknown so the namer re-evaluates it next cycle (e.g. its comm may have
// changed since we last looked).
func (t *Tracker) ignore(id ID) {
	// A nil entry in t.tracked means "known and deliberately not tracked".
	// (Idiomatic negation instead of `== false`, per staticcheck S1002.)
	if !t.alwaysRecheck {
		t.tracked[id] = nil
	}
}
func (t *Tracker) Ignore(id ProcId) {
t.Tracked[id] = nil
// update refreshes the proc's stored metrics with a newly read sample taken
// at time now, computing lastaccum as the counter growth since the previous
// sample. When more than one thread was read, per-thread deltas are
// refreshed too and threads not seen this cycle are pruned; otherwise the
// per-thread map is discarded entirely.
// NOTE(review): the cerrs parameter is not used in this body — confirm intent.
func (tp *trackedProc) update(metrics Metrics, now time.Time, cerrs *CollectErrors, threads []Thread) {
	// newcounts: resource consumption since last cycle
	newcounts := metrics.Counts
	tp.lastaccum = newcounts.Sub(tp.metrics.Counts)
	tp.metrics = metrics
	tp.lastUpdate = now
	if len(threads) > 1 {
		if tp.threads == nil {
			tp.threads = make(map[ThreadID]trackedThread)
		}
		for _, thr := range threads {
			// Default for an unseen thread: zero delta, accumulator seeded
			// from this snapshot, stamped with the current update time.
			tt := trackedThread{thr.ThreadName, thr.Counts, Delta{}, now, thr.Wchan}
			if old, ok := tp.threads[thr.ThreadID]; ok {
				// Known thread: delta is growth since its previous totals.
				tt.latest, tt.accum = thr.Counts.Sub(old.accum), thr.Counts
			}
			tp.threads[thr.ThreadID] = tt
		}
		// Prune threads that were not observed during this update cycle.
		for id, tt := range tp.threads {
			if tt.lastUpdate != now {
				delete(tp.threads, id)
			}
		}
	} else {
		tp.threads = nil
	}
}
// NOTE(review): the pre-refactor Update below is unterminated — its body is
// interleaved with the new handleProc/update code further down. Diff residue
// to be reconciled against version control.
// Scan procs and update metrics for those which are tracked. Processes that have gone
// away get removed from the Tracked map. New processes are returned, along with the count
// of permission errors.
func (t *Tracker) Update(procs ProcIter) ([]ProcIdInfo, int, error) {
	now := time.Now()
	var newProcs []ProcIdInfo
	var permissionErrors int
// handleProc updates the tracker if it's a known and not ignored proc.
// If it's neither known nor ignored, newProc will be non-nil.
// It is not an error if the process disappears while we are reading
// its info out of /proc, it just means nothing will be returned and
// the tracker will be unchanged.
func (t *Tracker) handleProc(proc Proc, updateTime time.Time) (*IDInfo, CollectErrors) {
	var cerrs CollectErrors
	procID, err := proc.GetProcID()
	if err != nil {
		// Couldn't even identify the proc; treat as vanished.
		return nil, cerrs
	}
	// Do nothing if we're ignoring this proc.
	last, known := t.tracked[procID]
	if known && last == nil {
		return nil, cerrs
	}
	metrics, softerrors, err := proc.GetMetrics()
	if err != nil {
		if t.debug {
			log.Printf("error reading metrics for %+v: %v", procID, err)
		}
		// This usually happens due to the proc having exited, i.e.
		// we lost the race. We don't count that as an error.
		if err != ErrProcNotExist {
			cerrs.Read++
		}
		return nil, cerrs
	}
	// Thread info failing to read is a partial-data soft error, not fatal.
	threads, err := proc.GetThreads()
	if err != nil {
		softerrors |= 1
	}
	cerrs.Partial += softerrors
	if len(threads) > 0 {
		// Replace process-level context-switch counts with the sums over
		// threads, and aggregate the per-thread states.
		metrics.Counts.CtxSwitchNonvoluntary, metrics.Counts.CtxSwitchVoluntary = 0, 0
		for _, thread := range threads {
			metrics.Counts.CtxSwitchNonvoluntary += thread.Counts.CtxSwitchNonvoluntary
			metrics.Counts.CtxSwitchVoluntary += thread.Counts.CtxSwitchVoluntary
			metrics.States.Add(thread.States)
		}
	}
	var newProc *IDInfo
	if known {
		last.update(metrics, updateTime, &cerrs, threads)
	} else {
		static, err := proc.GetStatic()
		if err != nil {
			if t.debug {
				log.Printf("error reading static details for %+v: %v", procID, err)
			}
			return nil, cerrs
		}
		newProc = &IDInfo{procID, static, metrics, threads}
		if t.debug {
			log.Printf("found new proc: %s", newProc)
		}
		// Is this a new process with the same pid as one we already know?
		// Then delete it from the known map, otherwise the cleanup in Update()
		// will remove the ProcIds entry we're creating here.
		if oldProcID, ok := t.procIds[procID.Pid]; ok {
			delete(t.tracked, oldProcID)
		}
		t.procIds[procID.Pid] = procID
	}
	return newProc, cerrs
}
// update scans procs and updates metrics for those which are tracked. Processes
// that have gone away get removed from the Tracked map. New processes are
// returned, along with the count of nonfatal errors.
// NOTE(review): this function body is interleaved with the body of the old
// pre-refactor Update (the one whose header appears above handleProc) and is
// not syntactically valid as written — missing braces, a duplicate loop header
// and duplicate return statements below. Left byte-identical; reconcile
// against version control rather than hand-editing.
func (t *Tracker) update(procs Iter) ([]IDInfo, CollectErrors, error) {
	var newProcs []IDInfo
	var colErrs CollectErrors
	var now = time.Now()
	for procs.Next() {
		// NOTE(review): the next three lines are old-version residue; the `if`
		// is never closed before the new handleProc call below.
		procId, err := procs.GetProcId()
		if err != nil {
			continue
		newProc, cerrs := t.handleProc(procs, now)
		if newProc != nil {
			newProcs = append(newProcs, *newProc)
		}
		// NOTE(review): old-version residue from here down to the closing brace
		// before the colErrs accumulation — it references the pre-refactor
		// Tracked/ProcIds fields and permissionErrors.
		last, known := t.Tracked[procId]
		// Are we ignoring this proc?
		if known && last == nil {
			continue
		}
		// TODO if just the io file is unreadable, should we still return the other metrics?
		metrics, err := procs.GetMetrics()
		if err != nil {
			if os.IsPermission(err) {
				permissionErrors++
				t.Ignore(procId)
			}
			continue
		}
		if known {
			var newaccum, lastaccum Counts
			dcpu := metrics.CpuTime - last.info.CpuTime
			drbytes := metrics.ReadBytes - last.info.ReadBytes
			dwbytes := metrics.WriteBytes - last.info.WriteBytes
			lastaccum = Counts{Cpu: dcpu, ReadBytes: drbytes, WriteBytes: dwbytes}
			newaccum = Counts{
				Cpu: last.accum.Cpu + lastaccum.Cpu,
				ReadBytes: last.accum.ReadBytes + lastaccum.ReadBytes,
				WriteBytes: last.accum.WriteBytes + lastaccum.WriteBytes,
			}
			last.info.ProcMetrics = metrics
			last.lastUpdate = now
			last.accum = newaccum
			last.lastaccum = lastaccum
		} else {
			static, err := procs.GetStatic()
			if err != nil {
				continue
			}
			newProcs = append(newProcs, ProcIdInfo{procId, static, metrics})
			// Is this a new process with the same pid as one we already know?
			if oldProcId, ok := t.ProcIds[procId.Pid]; ok {
				// Delete it from known, otherwise the cleanup below will remove the
				// ProcIds entry we're about to create
				delete(t.Tracked, oldProcId)
			}
			t.ProcIds[procId.Pid] = procId
		}
		colErrs.Read += cerrs.Read
		colErrs.Partial += cerrs.Partial
	}
	err := procs.Close()
	if err != nil {
		// NOTE(review): duplicate returns — the first is old-version residue.
		return nil, permissionErrors, fmt.Errorf("Error reading procs: %v", err)
		return nil, colErrs, fmt.Errorf("Error reading procs: %v", err)
	}
	// Rather than allocating a new map each time to detect procs that have
	// disappeared, we bump the last update time on those that are still
	// present. Then as a second pass we traverse the map looking for
	// stale procs and removing them.
	// NOTE(review): duplicate loop headers — the first is old-version residue.
	for procId, pinfo := range t.Tracked {
	for procID, pinfo := range t.tracked {
		if pinfo == nil {
			// TODO is this a bug? we're not tracking the proc so we don't see it go away so ProcIds
			// and Tracked are leaking?
			continue
		}
		if pinfo.lastUpdate != now {
			// NOTE(review): first pair of deletes is old-version residue.
			delete(t.Tracked, procId)
			delete(t.ProcIds, procId.Pid)
			delete(t.tracked, procID)
			delete(t.procIds, procID.Pid)
		}
	}
	// NOTE(review): duplicate returns — the first is old-version residue.
	return newProcs, permissionErrors, nil
	return newProcs, colErrs, nil
}
// checkAncestry walks the process tree recursively towards the root,
// stopping at pid 1 or upon finding a parent that's already tracked
// or ignored. If we find a tracked parent track this one too; if not,
// ignore this one. Returns the group name inherited, or "" if ignored.
func (t *Tracker) checkAncestry(idinfo IDInfo, newprocs map[ID]IDInfo) string {
	parentID := t.procIds[idinfo.ParentPid]
	if parentID.Pid < 1 {
		// Reached the top of the process tree without a tracked ancestor.
		if t.debug {
			log.Printf("ignoring unmatched proc with no matched parent: %+v", idinfo)
		}
		t.ignore(idinfo.ID)
		return ""
	}
	if parent, ok := t.tracked[parentID]; ok {
		if parent == nil {
			// Parent is known but ignored; ignore this proc as well.
			t.ignore(idinfo.ID)
			return ""
		}
		// Parent is tracked, so this proc inherits its group.
		if t.debug {
			log.Printf("matched as %q because child of %+v: %+v",
				parent.groupName, parentID, idinfo)
		}
		t.track(parent.groupName, idinfo)
		return parent.groupName
	}
	if parentInfo, ok := newprocs[parentID]; ok {
		// Parent is itself a new proc; resolve its ancestry first.
		if gname := t.checkAncestry(parentInfo, newprocs); gname != "" {
			if t.debug {
				log.Printf("matched as %q because child of %+v: %+v",
					gname, parentID, idinfo)
			}
			t.track(gname, idinfo)
			return gname
		}
		// Parent resolved to "ignored"; fall through and ignore this one too.
	}
	// Parent is dead, i.e. we never saw it, or there's no tracked proc in our ancestry.
	if t.debug {
		log.Printf("ignoring unmatched proc with no matched parent: %+v", idinfo)
	}
	t.ignore(idinfo.ID)
	return ""
}
// lookupUid resolves a numeric uid to a username, falling back to the
// decimal uid string when the OS lookup fails. Results are memoized in
// t.username so each uid is resolved at most once.
func (t *Tracker) lookupUid(uid int) string {
	if cached, ok := t.username[uid]; ok {
		return cached
	}
	uidstr := strconv.Itoa(uid)
	name := uidstr
	if u, err := user.LookupId(uidstr); err == nil {
		name = u.Username
	}
	t.username[uid] = name
	return name
}
// Update modifies the tracker's internal state based on what it reads from
// iter. Tracks any new procs the namer wants tracked, and updates
// its metrics for existing tracked procs. Returns nonfatal errors
// and the status of all tracked procs, or an error if fatal.
func (t *Tracker) Update(iter Iter) (CollectErrors, []Update, error) {
	newProcs, colErrs, err := t.update(iter)
	if err != nil {
		return colErrs, nil, err
	}

	// Pass 1: match each new proc by name/cmdline/user; remember the rest.
	untracked := make(map[ID]IDInfo)
	for _, idinfo := range newProcs {
		attrs := common.ProcAttributes{
			Name:     idinfo.Name,
			Cmdline:  idinfo.Cmdline,
			Username: t.lookupUid(idinfo.EffectiveUID),
		}
		wanted, gname := t.namer.MatchAndName(attrs)
		if !wanted {
			untracked[idinfo.ID] = idinfo
			continue
		}
		if t.debug {
			log.Printf("matched as %q: %+v", gname, idinfo)
		}
		t.track(gname, idinfo)
	}

	// Pass 2: adopt unmatched new procs whose ancestry leads to a tracked proc.
	if t.trackChildren {
		for _, idinfo := range untracked {
			if _, known := t.tracked[idinfo.ID]; known {
				// Already tracked or ignored in an earlier iteration
				continue
			}
			t.checkAncestry(idinfo, untracked)
		}
	}

	// Collect an Update for every currently tracked (non-ignored) proc.
	updates := []Update{}
	for _, tproc := range t.tracked {
		if tproc == nil {
			continue
		}
		updates = append(updates, tproc.getUpdate())
	}
	return colErrs, updates, nil
}