batch metrics and flush periodically
This commit is contained in:
parent
b78bb2524e
commit
2207d7694d
7 changed files with 298 additions and 265 deletions
|
|
@ -1,50 +1,83 @@
|
|||
-- monitor.lua — accumulates one metrics record per request in an in-memory
-- batch and periodically flushes the batch as a JSON array over a unix
-- socket to a local prometheus collector.
local socket = ngx.socket.tcp
local cjson = require('cjson')
local assert = assert

local metrics_batch = {}
-- if an Nginx worker processes more than (MAX_BATCH_SIZE/FLUSH_INTERVAL) RPS then it will start dropping metrics
local MAX_BATCH_SIZE = 10000
local FLUSH_INTERVAL = 1 -- second

local _M = {}

-- Write one JSON payload to the collector's unix socket.
-- Any connect/send/close failure raises via assert.
local function send(payload)
  local s = assert(socket())
  assert(s:connect("unix:/tmp/prometheus-nginx.socket"))
  assert(s:send(payload))
  assert(s:close())
end

-- Snapshot the current request's metrics from ngx.var.
-- Missing string vars default to "-", non-numeric/missing numeric vars to -1.
local function metrics()
  return {
    host = ngx.var.host or "-",
    namespace = ngx.var.namespace or "-",
    ingress = ngx.var.ingress_name or "-",
    service = ngx.var.service_name or "-",
    path = ngx.var.location_path or "-",

    method = ngx.var.request_method or "-",
    status = ngx.var.status or "-",
    requestLength = tonumber(ngx.var.request_length) or -1,
    requestTime = tonumber(ngx.var.request_time) or -1,
    responseLength = tonumber(ngx.var.bytes_sent) or -1,

    endpoint = ngx.var.upstream_addr or "-",
    upstreamLatency = tonumber(ngx.var.upstream_connect_time) or -1,
    upstreamResponseTime = tonumber(ngx.var.upstream_response_time) or -1,
    upstreamResponseLength = tonumber(ngx.var.upstream_response_length) or -1,
    upstreamStatus = ngx.var.upstream_status or "-",
  }
end

-- Timer handler: JSON-encode the accumulated batch and send it.
-- `premature` is true when nginx fires the timer during worker shutdown.
local function flush(premature)
  if premature then
    return
  end

  if #metrics_batch == 0 then
    return
  end

  -- swap the batch out first so new requests keep accumulating
  -- while this batch is being encoded and sent
  local current_metrics_batch = metrics_batch
  metrics_batch = {}

  local ok, payload = pcall(cjson.encode, current_metrics_batch)
  if not ok then
    ngx.log(ngx.ERR, "error while encoding metrics: " .. tostring(payload))
    return
  end

  send(payload)
end

-- Schedules the periodic flush; intended to run once per worker
-- (presumably from init_worker_by_lua — confirm against nginx config).
function _M.init_worker()
  local _, err = ngx.timer.every(FLUSH_INTERVAL, flush)
  if err then
    ngx.log(ngx.ERR, string.format("error when setting up timer.every: %s", tostring(err)))
  end
end

-- Records metrics for the current request, dropping the record (with a
-- warning) once the batch is full.
function _M.call()
  if #metrics_batch >= MAX_BATCH_SIZE then
    ngx.log(ngx.WARN, "omitting metrics for the request, current batch is full")
    return
  end

  table.insert(metrics_batch, metrics())
end

-- test-only escape hatch: expose internals when _TEST is set by the specs
if _TEST then
  _M.flush = flush
  _M.get_metrics_batch = function() return metrics_batch end
end

return _M
|
||||
|
|
|
|||
|
|
@ -1,12 +0,0 @@
|
|||
-- Busted spec for util.defer's timer-phase dispatch.
_G._TEST = true

local defer = require("util.defer")

describe("Defer", function()
  describe("to_timer_phase", function()
    it("executes passed callback immediately if called on timer phase", function()
      defer.counter = 0

      local increment = function()
        defer.counter = defer.counter + 1
      end
      defer.to_timer_phase(increment)

      assert.equal(defer.counter, 1)
    end)
  end)
end)
|
||||
|
|
@ -1,96 +1,106 @@
|
|||
-- Busted specs for the monitor module. _TEST makes monitor expose
-- flush() and get_metrics_batch() for inspection.
_G._TEST = true

local original_ngx = ngx
local function reset_ngx()
  _G.ngx = original_ngx
end

-- Install `mock` as the global ngx, falling back to the real ngx for
-- any field the mock does not override.
local function mock_ngx(mock)
  local _ngx = mock
  setmetatable(_ngx, { __index = ngx })
  _G.ngx = _ngx
end

-- Replace ngx.socket.tcp with stubs so specs can assert on
-- connect/send/close interactions.
local function mock_ngx_socket_tcp()
  local tcp_mock = {}
  stub(tcp_mock, "connect", true)
  stub(tcp_mock, "send", true)
  stub(tcp_mock, "close", true)

  local socket_mock = {}
  stub(socket_mock, "tcp", tcp_mock)
  mock_ngx({ socket = socket_mock })

  return tcp_mock
end

describe("Monitor", function()
  after_each(function()
    reset_ngx()
    -- force a fresh module (and therefore an empty batch) for every spec
    package.loaded["monitor"] = nil
  end)

  it("batches metrics", function()
    local monitor = require("monitor")
    mock_ngx({ var = {} })

    for i = 1,10,1 do
      monitor.call()
    end

    assert.equal(10, #monitor.get_metrics_batch())
  end)

  describe("flush", function()
    it("short circuits when premature is true (when worker is shutting down)", function()
      local tcp_mock = mock_ngx_socket_tcp()
      local monitor = require("monitor")
      mock_ngx({ var = {} })

      for i = 1,10,1 do
        monitor.call()
      end
      monitor.flush(true)
      assert.stub(tcp_mock.connect).was_not_called()
    end)

    it("short circuits when there's no metrics batched", function()
      local tcp_mock = mock_ngx_socket_tcp()
      local monitor = require("monitor")

      monitor.flush()
      assert.stub(tcp_mock.connect).was_not_called()
    end)

    it("JSON encodes and sends the batched metrics", function()
      local tcp_mock = mock_ngx_socket_tcp()
      local monitor = require("monitor")

      local ngx_var_mock = {
        host = "example.com",
        namespace = "default",
        ingress_name = "example",
        service_name = "http-svc",
        location_path = "/",

        request_method = "GET",
        status = "200",
        request_length = "256",
        request_time = "0.04",
        bytes_sent = "512",

        upstream_addr = "10.10.0.1",
        upstream_connect_time = "0.01",
        upstream_response_time = "0.02",
        upstream_response_length = "456",
        upstream_status = "200",
      }
      mock_ngx({ var = ngx_var_mock })
      monitor.call()

      local ngx_var_mock1 = ngx_var_mock
      ngx_var_mock1.status = "201"
      ngx_var_mock1.request_method = "POST"
      mock_ngx({ var = ngx_var_mock })
      monitor.call()

      monitor.flush()

      local expected_payload = '[{"upstreamStatus":"200","requestLength":256,"ingress":"example","status":"200","service":"http-svc","requestTime":0.04,"namespace":"default","host":"example.com","method":"GET","upstreamResponseTime":0.02,"upstreamResponseLength":456,"endpoint":"10.10.0.1","upstreamLatency":0.01,"path":"\\/","responseLength":512},{"upstreamStatus":"200","requestLength":256,"ingress":"example","status":"201","service":"http-svc","requestTime":0.04,"namespace":"default","host":"example.com","method":"POST","upstreamResponseTime":0.02,"upstreamResponseLength":456,"endpoint":"10.10.0.1","upstreamLatency":0.01,"path":"\\/","responseLength":512}]'

      assert.stub(tcp_mock.connect).was_called_with(tcp_mock, "unix:/tmp/prometheus-nginx.socket")
      assert.stub(tcp_mock.send).was_called_with(tcp_mock, expected_payload)
      assert.stub(tcp_mock.close).was_called_with(tcp_mock)
    end)
  end)
end)
|
||||
|
|
|
|||
|
|
@ -1,57 +0,0 @@
|
|||
-- util/defer.lua — defers function calls to nginx's timer phase, where
-- APIs such as cosockets are usable.
local util = require("util")

local timer_started = false
local queue = {}
local MAX_QUEUE_SIZE = 10000

local _M = {}

-- Timer callback: detach the pending queue, then run every deferred call.
local function flush_queue(premature)
  -- TODO Investigate if we should actually still flush the queue when we're
  -- shutting down.
  if premature then
    return
  end

  local pending = queue
  queue = {}
  timer_started = false

  for _, entry in ipairs(pending) do
    entry.func(unpack(entry.args))
  end
end

-- `to_timer_phase` will enqueue a function that will be executed in a timer
-- context, at a later point in time. The purpose is that some APIs (such as
-- sockets) are not available during some nginx request phases (such as the
-- logging phase), but are available for use in timers. There are no ordering
-- guarantees for when a function will be executed.
-- Returns true on success, or nil plus an error message.
function _M.to_timer_phase(func, ...)
  -- already in a timer: no need to defer, run inline
  if ngx.get_phase() == "timer" then
    func(...)
    return true
  end

  if #queue >= MAX_QUEUE_SIZE then
    ngx.log(ngx.ERR, "deferred timer queue full")
    return nil, "deferred timer queue full"
  end

  queue[#queue + 1] = { func = func, args = {...} }

  if timer_started then
    return true
  end

  local ok, err = ngx.timer.at(0, flush_queue)
  if not ok then
    local msg = "failed to create timer: " .. tostring(err)
    ngx.log(ngx.ERR, msg)
    return nil, msg
  end

  -- unfortunately this is to deal with tests - when running unit tests, we
  -- dont actually run the timer, we call the function inline; only mark the
  -- timer as pending if the queue still holds work afterwards
  if util.tablelength(queue) > 0 then
    timer_started = true
  end

  return true
end

return _M
|
||||
Loading…
Add table
Add a link
Reference in a new issue