diff --git a/cmd/containerd/builtins.go b/cmd/containerd/builtins.go index 71bfe42..9620297 100644 --- a/cmd/containerd/builtins.go +++ b/cmd/containerd/builtins.go @@ -6,6 +6,7 @@ import ( _ "github.com/docker/containerd/services/content" _ "github.com/docker/containerd/services/execution" _ "github.com/docker/containerd/services/healthcheck" + _ "github.com/docker/containerd/services/metrics" _ "github.com/docker/containerd/snapshot/btrfs" _ "github.com/docker/containerd/snapshot/overlay" ) diff --git a/cmd/containerd/main.go b/cmd/containerd/main.go index 318ef50..5b69c6a 100644 --- a/cmd/containerd/main.go +++ b/cmd/containerd/main.go @@ -11,6 +11,7 @@ import ( "syscall" "time" + grpc_prometheus "github.com/grpc-ecosystem/go-grpc-prometheus" gocontext "golang.org/x/net/context" "google.golang.org/grpc" @@ -296,7 +297,10 @@ func loadSnapshotter(store *content.Store) (snapshot.Snapshotter, error) { } func newGRPCServer() *grpc.Server { - s := grpc.NewServer(grpc.UnaryInterceptor(interceptor)) + s := grpc.NewServer( + grpc.UnaryInterceptor(interceptor), + grpc.StreamInterceptor(grpc_prometheus.StreamServerInterceptor), + ) return s } @@ -365,7 +369,7 @@ func interceptor(ctx gocontext.Context, default: fmt.Printf("unknown GRPC server type: %#v\n", info.Server) } - return handler(ctx, req) + return grpc_prometheus.UnaryServerInterceptor(ctx, req, info, handler) } func handleSignals(signals chan os.Signal, server *grpc.Server) error { diff --git a/services/metrics/metrics.go b/services/metrics/metrics.go new file mode 100644 index 0000000..f87c86b --- /dev/null +++ b/services/metrics/metrics.go @@ -0,0 +1,26 @@ +package metrics + +import ( + "github.com/docker/containerd/plugin" + grpc_prometheus "github.com/grpc-ecosystem/go-grpc-prometheus" + "google.golang.org/grpc" +) + +func init() { + plugin.Register("metrics-grpc", &plugin.Registration{ + Type: plugin.GRPCPlugin, + Init: New, + }) +} + +func New(_ *plugin.InitContext) (interface{}, error) { + return &Service{}, nil +} + +type Service struct { +} + +func (s *Service) Register(server *grpc.Server) error { + grpc_prometheus.Register(server) + return nil +} diff --git a/vendor.conf b/vendor.conf index 1054185..2a4bb2d 100644 --- a/vendor.conf +++ b/vendor.conf @@ -30,3 +30,4 @@ github.com/opencontainers/image-spec a431dbcf6a74fca2e0e040b819a836dbe3fb23ca github.com/stevvooe/continuity 1530f13d23b34e2ccaf33881fefecc7e28e3577b golang.org/x/sync 450f422ab23cf9881c94e2db30cac0eb1b7cf80c github.com/BurntSushi/toml v0.2.0-21-g9906417 +github.com/grpc-ecosystem/go-grpc-prometheus 6b7015e65d366bf3f19b2b2a000a831940f0f7e0 diff --git a/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/LICENSE b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/LICENSE new file mode 100644 index 0000000..b2b0650 --- /dev/null +++ b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
\ No newline at end of file
diff --git a/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/README.md b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/README.md
new file mode 100644
index 0000000..68c8c46
--- /dev/null
+++ b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/README.md
@@ -0,0 +1,245 @@
+# Go gRPC Interceptors for Prometheus monitoring
+
+[![Travis Build](https://travis-ci.org/grpc-ecosystem/go-grpc-prometheus.svg)](https://travis-ci.org/grpc-ecosystem/go-grpc-prometheus)
+[![Go Report Card](https://goreportcard.com/badge/github.com/grpc-ecosystem/go-grpc-prometheus)](http://goreportcard.com/report/grpc-ecosystem/go-grpc-prometheus)
+[![GoDoc](http://img.shields.io/badge/GoDoc-Reference-blue.svg)](https://godoc.org/github.com/grpc-ecosystem/go-grpc-prometheus)
+[![Apache 2.0 License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE)
+
+[Prometheus](https://prometheus.io/) monitoring for your [gRPC Go](https://github.com/grpc/grpc-go) servers and clients.
+
+A sister implementation for [gRPC Java](https://github.com/grpc/grpc-java) (same metrics, same semantics) is in [grpc-ecosystem/java-grpc-prometheus](https://github.com/grpc-ecosystem/java-grpc-prometheus).
+
+## Interceptors
+
+[gRPC Go](https://github.com/grpc/grpc-go) recently gained support for Interceptors, i.e. middleware that is executed
+by a gRPC Server before the request is passed on to the user's application logic. It is a perfect way to implement
+common patterns: auth, logging and... monitoring.
+
+To use Interceptors in chains, please see [`go-grpc-middleware`](https://github.com/mwitkow/go-grpc-middleware).
+
+## Usage
+
+There are two types of interceptors: client-side and server-side. This package provides monitoring Interceptors for both.
+
+### Server-side
+
+```go
+import "github.com/grpc-ecosystem/go-grpc-prometheus"
+...
+    // Initialize your gRPC server's interceptors.
+    myServer := grpc.NewServer(
+        grpc.StreamInterceptor(grpc_prometheus.StreamServerInterceptor),
+        grpc.UnaryInterceptor(grpc_prometheus.UnaryServerInterceptor),
+    )
+    // Register your gRPC service implementations.
+    myservice.RegisterMyServiceServer(myServer, &myServiceImpl{})
+    // After all your registrations, make sure all of the Prometheus metrics are initialized.
+    grpc_prometheus.Register(myServer)
+    // Register Prometheus metrics handler.
+    http.Handle("/metrics", prometheus.Handler())
+...
+```
+
+### Client-side
+
+```go
+import "github.com/grpc-ecosystem/go-grpc-prometheus"
+...
+    clientConn, err = grpc.Dial(
+        address,
+        grpc.WithUnaryInterceptor(grpc_prometheus.UnaryClientInterceptor),
+        grpc.WithStreamInterceptor(grpc_prometheus.StreamClientInterceptor),
+    )
+    client = pb_testproto.NewTestServiceClient(clientConn)
+    resp, err := client.PingEmpty(s.ctx, &myservice.Request{Msg: "hello"})
+...
+```
+
+# Metrics
+
+## Labels
+
+All server-side metrics start with `grpc_server` as the Prometheus subsystem name. All client-side metrics start with `grpc_client`. The two mirror each other's concepts. All methods
+carry the same rich labels:
+
+  * `grpc_service` - the [gRPC service](http://www.grpc.io/docs/#defining-a-service) name, which is the combination of the protobuf `package` and
+    the `service` section name. E.g. for `package = mwitkow.testproto` and
+    `service TestService` the label will be `grpc_service="mwitkow.testproto.TestService"`
+  * `grpc_method` - the name of the method called on the gRPC service. E.g.
+    `grpc_method="Ping"`
+  * `grpc_type` - the gRPC [type of request](http://www.grpc.io/docs/guides/concepts.html#rpc-life-cycle).
+    Differentiating between these is important especially for latency measurements.
+
+      - `unary` is a single-request, single-response RPC
+      - `client_stream` is a multi-request, single-response RPC
+      - `server_stream` is a single-request, multi-response RPC
+      - `bidi_stream` is a multi-request, multi-response RPC
+
+Additionally, for completed RPCs, the following label is used:
+
+  * `grpc_code` - the human-readable [gRPC status code](https://github.com/grpc/grpc-go/blob/master/codes/codes.go).
+    The list of all statuses is too long, but here are some common ones:
+
+      - `OK` - means the RPC was successful
+      - `InvalidArgument` - the RPC contained bad values
+      - `Internal` - a server-side error not disclosed to the clients
+
+## Counters
+
+The counters and their up-to-date documentation are in [server_reporter.go](server_reporter.go) and [client_reporter.go](client_reporter.go);
+the metrics themselves are exported on the respective Prometheus handler (usually `/metrics`).
+
+For the purpose of this documentation we will only discuss `grpc_server` metrics. The `grpc_client` ones mirror them.
+
+For simplicity, let's assume we're tracking a single server-side RPC call of [`mwitkow.testproto.TestService`](examples/testproto/test.proto),
+calling the method `PingList`. The call succeeds and returns 20 messages in the stream.
+
+First, immediately after the server receives the call, it will increment
+`grpc_server_started_total` and start the handling time clock (if histograms are enabled).
+
+```jsoniq
+grpc_server_started_total{grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream"} 1
+```
+
+Then the user logic gets invoked. It receives one message from the client containing the request
+(it's a `server_stream`):
+
+```jsoniq
+grpc_server_msg_received_total{grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream"} 1
+```
+
+The user logic may return an error, or send multiple messages back to the client. In this case, on
+each of the 20 messages sent back, a counter will be incremented:
+
+```jsoniq
+grpc_server_msg_sent_total{grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream"} 20
+```
+
+After the call completes, its status (`OK` or another [gRPC status code](https://github.com/grpc/grpc-go/blob/master/codes/codes.go))
+and the relevant call labels increment the `grpc_server_handled_total` counter.
+
+```jsoniq
+grpc_server_handled_total{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream"} 1
+```
+
+## Histograms
+
+[Prometheus histograms](https://prometheus.io/docs/concepts/metric_types/#histogram) are a great way
+to measure latency distributions of your RPCs. However, since it is bad practice to have metrics
+of [high cardinality](https://prometheus.io/docs/practices/instrumentation/#do-not-overuse-labels),
+the latency monitoring metrics are disabled by default. To enable them, please call the following
+in your server initialization code:
+
+```go
+grpc_prometheus.EnableHandlingTimeHistogram()
+```
+
+After the call completes, its handling time will be recorded in a [Prometheus histogram](https://prometheus.io/docs/concepts/metric_types/#histogram)
+variable `grpc_server_handling_seconds`. It contains three sub-metrics:
+
+  * `grpc_server_handling_seconds_count` - the count of all completed RPCs by status and method
+  * `grpc_server_handling_seconds_sum` - cumulative time of RPCs by status and method, useful for
+    calculating average handling times
+  * `grpc_server_handling_seconds_bucket` - contains the counts of RPCs by status and method in respective
+    handling-time buckets. These buckets can be used by Prometheus to estimate SLAs (see [here](https://prometheus.io/docs/practices/histograms/))
+
+The counter values will look as follows:
+
+```jsoniq
+grpc_server_handling_seconds_bucket{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream",le="0.005"} 1
+grpc_server_handling_seconds_bucket{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream",le="0.01"} 1
+grpc_server_handling_seconds_bucket{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream",le="0.025"} 1
+grpc_server_handling_seconds_bucket{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream",le="0.05"} 1
+grpc_server_handling_seconds_bucket{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream",le="0.1"} 1
+grpc_server_handling_seconds_bucket{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream",le="0.25"} 1
+grpc_server_handling_seconds_bucket{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream",le="0.5"} 1
+grpc_server_handling_seconds_bucket{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream",le="1"} 1
+grpc_server_handling_seconds_bucket{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream",le="2.5"} 1
+grpc_server_handling_seconds_bucket{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream",le="5"} 1
+grpc_server_handling_seconds_bucket{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream",le="10"} 1
+grpc_server_handling_seconds_bucket{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream",le="+Inf"} 1
+grpc_server_handling_seconds_sum{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream"} 0.0003866430000000001
+grpc_server_handling_seconds_count{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream"} 1
+```
+
+
+## Useful query examples
+
+The Prometheus philosophy is to provide the most detailed metrics possible to the monitoring system, and
+let the aggregations be handled there. The verbosity of the above metrics makes that flexibility possible.
+Here are a couple of useful monitoring queries:
+
+
+### request inbound rate
+```jsoniq
+sum(rate(grpc_server_started_total{job="foo"}[1m])) by (grpc_service)
+```
+For `job="foo"` (a common label to differentiate between Prometheus monitoring targets), calculate the
+rate of requests per second (1-minute window) for each gRPC `grpc_service` that the job has. Note
+how the `grpc_method` is omitted here: all methods of a given gRPC service are summed together.
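+Several of the queries below slice the handling-time histogram by its `le` buckets, which default
+to `prom.DefBuckets` as shown above. If those boundaries don't match your SLAs, they can be
+overridden when the histogram is enabled. A minimal server-side sketch using this package's
+`WithHistogramBuckets` option (the bucket values here are illustrative, not a recommendation):
+
+```go
+package main
+
+import (
+	grpc_prometheus "github.com/grpc-ecosystem/go-grpc-prometheus"
+)
+
+func main() {
+	// Enable grpc_server_handling_seconds with custom buckets; pick
+	// boundaries around the latencies you actually want to alert on,
+	// and do this before any RPCs are served.
+	grpc_prometheus.EnableHandlingTimeHistogram(
+		grpc_prometheus.WithHistogramBuckets([]float64{0.005, 0.025, 0.1, 0.25, 0.5, 1, 5}),
+	)
+}
+```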
+
+### unary request error rate
+```jsoniq
+sum(rate(grpc_server_handled_total{job="foo",grpc_type="unary",grpc_code!="OK"}[1m])) by (grpc_service)
+```
+For `job="foo"`, calculate the per-`grpc_service` rate of `unary` (1:1) RPCs that failed, i.e. the
+ones that didn't finish with an `OK` code.
+
+### unary request error percentage
+```jsoniq
+sum(rate(grpc_server_handled_total{job="foo",grpc_type="unary",grpc_code!="OK"}[1m])) by (grpc_service)
+ /
+sum(rate(grpc_server_started_total{job="foo",grpc_type="unary"}[1m])) by (grpc_service)
+ * 100.0
+```
+For `job="foo"`, calculate the percentage of failed requests by service. It's easy to notice that
+this is a combination of the two above examples. This is an example of a query you would like to
+[alert on](https://prometheus.io/docs/alerting/rules/) in your system for SLA violations, e.g.
+"no more than 1% of requests should fail".
+
+### average response stream size
+```jsoniq
+sum(rate(grpc_server_msg_sent_total{job="foo",grpc_type="server_stream"}[10m])) by (grpc_service)
+ /
+sum(rate(grpc_server_started_total{job="foo",grpc_type="server_stream"}[10m])) by (grpc_service)
+```
+For `job="foo"`, the `grpc_service`-wide `10m` average number of messages returned by all
+`server_stream` RPCs. This allows you to track the stream sizes returned by your system, e.g. to
+notice when clients start sending "wide" queries that return a larger number of messages.
+Note the divisor is the number of started RPCs, in order to account for in-flight requests.
+
+### 99th-percentile latency of unary requests
+```jsoniq
+histogram_quantile(0.99,
+  sum(rate(grpc_server_handling_seconds_bucket{job="foo",grpc_type="unary"}[5m])) by (grpc_service,le)
+)
+```
+For `job="foo"`, returns a 99th-percentile [quantile estimation](https://prometheus.io/docs/practices/histograms/#quantiles)
+of the handling time of RPCs per service. Note the `5m` rate: the quantile estimation takes
+samples in a rolling `5m` window. When combined with other quantiles
+(e.g. 50%, 90%), this query gives you tremendous insight into the responsiveness of your system
+(e.g. the impact of caching).
+
+### percentage of slow unary queries (>250ms)
+```jsoniq
+100.0 - (
+sum(rate(grpc_server_handling_seconds_bucket{job="foo",grpc_type="unary",le="0.25"}[5m])) by (grpc_service)
+ /
+sum(rate(grpc_server_handling_seconds_count{job="foo",grpc_type="unary"}[5m])) by (grpc_service)
+) * 100.0
+```
+For `job="foo"`, calculate the per-`grpc_service` percentage of slow requests that took longer than `0.25`
+seconds. This query is relatively complex, since Prometheus aggregations use `le` (less-or-equal)
+buckets, meaning that counting the fraction of "fast" requests is easier; simple arithmetic then
+yields the slow share. This is an example of a query you would like to alert on in your system for
+SLA violations, e.g. "less than 1% of requests are slower than 250ms".
+
+
+## Status
+
+This code has been used since August 2015 as the basis for monitoring of *production* gRPC microservices at [Improbable](https://improbable.io).
+
+## License
+
+`go-grpc-prometheus` is released under the Apache 2.0 license. See the [LICENSE](LICENSE) file for details.
diff --git a/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/client.go b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/client.go
new file mode 100644
index 0000000..d9e87b2
--- /dev/null
+++ b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/client.go
@@ -0,0 +1,72 @@
+// Copyright 2016 Michal Witkowski. All Rights Reserved.
+// See LICENSE for licensing terms.
+
+// gRPC Prometheus monitoring interceptors for client-side gRPC.
+
+package grpc_prometheus
+
+import (
+	"io"
+
+	"golang.org/x/net/context"
+	"google.golang.org/grpc"
+	"google.golang.org/grpc/codes"
+)
+
+// UnaryClientInterceptor is a gRPC client-side interceptor that provides Prometheus monitoring for Unary RPCs.
+func UnaryClientInterceptor(ctx context.Context, method string, req, reply interface{}, cc *grpc.ClientConn, invoker grpc.UnaryInvoker, opts ...grpc.CallOption) error {
+	monitor := newClientReporter(Unary, method)
+	monitor.SentMessage()
+	err := invoker(ctx, method, req, reply, cc, opts...)
+	// A unary reply message is only received when the call succeeds.
+	if err == nil {
+		monitor.ReceivedMessage()
+	}
+	monitor.Handled(grpc.Code(err))
+	return err
+}
+
+// StreamClientInterceptor is a gRPC client-side interceptor that provides Prometheus monitoring for Streaming RPCs.
+func StreamClientInterceptor(ctx context.Context, desc *grpc.StreamDesc, cc *grpc.ClientConn, method string, streamer grpc.Streamer, opts ...grpc.CallOption) (grpc.ClientStream, error) {
+	monitor := newClientReporter(clientStreamType(desc), method)
+	clientStream, err := streamer(ctx, desc, cc, method, opts...)
+	if err != nil {
+		monitor.Handled(grpc.Code(err))
+		return nil, err
+	}
+	return &monitoredClientStream{clientStream, monitor}, nil
+}
+
+func clientStreamType(desc *grpc.StreamDesc) grpcType {
+	if desc.ClientStreams && !desc.ServerStreams {
+		return ClientStream
+	} else if !desc.ClientStreams && desc.ServerStreams {
+		return ServerStream
+	}
+	return BidiStream
+}
+
+// monitoredClientStream wraps grpc.ClientStream, allowing each Sent/Recv of a message to increment counters.
+type monitoredClientStream struct {
+	grpc.ClientStream
+	monitor *clientReporter
+}
+
+func (s *monitoredClientStream) SendMsg(m interface{}) error {
+	err := s.ClientStream.SendMsg(m)
+	if err == nil {
+		s.monitor.SentMessage()
+	}
+	return err
+}
+
+func (s *monitoredClientStream) RecvMsg(m interface{}) error {
+	err := s.ClientStream.RecvMsg(m)
+	if err == nil {
+		s.monitor.ReceivedMessage()
+	} else if err == io.EOF {
+		s.monitor.Handled(codes.OK)
+	} else {
+		s.monitor.Handled(grpc.Code(err))
+	}
+	return err
+}
diff --git a/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/client_reporter.go b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/client_reporter.go
new file mode 100644
index 0000000..16b7615
--- /dev/null
+++ b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/client_reporter.go
@@ -0,0 +1,111 @@
+// Copyright 2016 Michal Witkowski. All Rights Reserved.
+// See LICENSE for licensing terms.
+ +package grpc_prometheus + +import ( + "time" + + "google.golang.org/grpc/codes" + + prom "github.com/prometheus/client_golang/prometheus" +) + +var ( + clientStartedCounter = prom.NewCounterVec( + prom.CounterOpts{ + Namespace: "grpc", + Subsystem: "client", + Name: "started_total", + Help: "Total number of RPCs started on the client.", + }, []string{"grpc_type", "grpc_service", "grpc_method"}) + + clientHandledCounter = prom.NewCounterVec( + prom.CounterOpts{ + Namespace: "grpc", + Subsystem: "client", + Name: "handled_total", + Help: "Total number of RPCs completed by the client, regardless of success or failure.", + }, []string{"grpc_type", "grpc_service", "grpc_method", "grpc_code"}) + + clientStreamMsgReceived = prom.NewCounterVec( + prom.CounterOpts{ + Namespace: "grpc", + Subsystem: "client", + Name: "msg_received_total", + Help: "Total number of RPC stream messages received by the client.", + }, []string{"grpc_type", "grpc_service", "grpc_method"}) + + clientStreamMsgSent = prom.NewCounterVec( + prom.CounterOpts{ + Namespace: "grpc", + Subsystem: "client", + Name: "msg_sent_total", + Help: "Total number of gRPC stream messages sent by the client.", + }, []string{"grpc_type", "grpc_service", "grpc_method"}) + + clientHandledHistogramEnabled = false + clientHandledHistogramOpts = prom.HistogramOpts{ + Namespace: "grpc", + Subsystem: "client", + Name: "handling_seconds", + Help: "Histogram of response latency (seconds) of the gRPC until it is finished by the application.", + Buckets: prom.DefBuckets, + } + clientHandledHistogram *prom.HistogramVec +) + +func init() { + prom.MustRegister(clientStartedCounter) + prom.MustRegister(clientHandledCounter) + prom.MustRegister(clientStreamMsgReceived) + prom.MustRegister(clientStreamMsgSent) +} + +// EnableClientHandlingTimeHistogram turns on recording of handling time of RPCs. +// Histogram metrics can be very expensive for Prometheus to retain and query. 
+func EnableClientHandlingTimeHistogram(opts ...HistogramOption) {
+	for _, o := range opts {
+		o(&clientHandledHistogramOpts)
+	}
+	if !clientHandledHistogramEnabled {
+		clientHandledHistogram = prom.NewHistogramVec(
+			clientHandledHistogramOpts,
+			[]string{"grpc_type", "grpc_service", "grpc_method"},
+		)
+		prom.Register(clientHandledHistogram)
+	}
+	clientHandledHistogramEnabled = true
+}
+
+type clientReporter struct {
+	rpcType     grpcType
+	serviceName string
+	methodName  string
+	startTime   time.Time
+}
+
+func newClientReporter(rpcType grpcType, fullMethod string) *clientReporter {
+	r := &clientReporter{rpcType: rpcType}
+	if clientHandledHistogramEnabled {
+		r.startTime = time.Now()
+	}
+	r.serviceName, r.methodName = splitMethodName(fullMethod)
+	clientStartedCounter.WithLabelValues(string(r.rpcType), r.serviceName, r.methodName).Inc()
+	return r
+}
+
+func (r *clientReporter) ReceivedMessage() {
+	clientStreamMsgReceived.WithLabelValues(string(r.rpcType), r.serviceName, r.methodName).Inc()
+}
+
+func (r *clientReporter) SentMessage() {
+	clientStreamMsgSent.WithLabelValues(string(r.rpcType), r.serviceName, r.methodName).Inc()
+}
+
+func (r *clientReporter) Handled(code codes.Code) {
+	clientHandledCounter.WithLabelValues(string(r.rpcType), r.serviceName, r.methodName, code.String()).Inc()
+	if clientHandledHistogramEnabled {
+		clientHandledHistogram.WithLabelValues(string(r.rpcType), r.serviceName, r.methodName).Observe(time.Since(r.startTime).Seconds())
+	}
+}
diff --git a/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/server.go b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/server.go
new file mode 100644
index 0000000..f85c8c2
--- /dev/null
+++ b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/server.go
@@ -0,0 +1,74 @@
+// Copyright 2016 Michal Witkowski. All Rights Reserved.
+// See LICENSE for licensing terms.
+
+// gRPC Prometheus monitoring interceptors for server-side gRPC.
+
+package grpc_prometheus
+
+import (
+	"golang.org/x/net/context"
+	"google.golang.org/grpc"
+)
+
+// Register takes a gRPC server and pre-initializes all counters to 0.
+// This allows for easier monitoring in Prometheus (no missing metrics), and should be called *after* all services have
+// been registered with the server.
+func Register(server *grpc.Server) {
+	serviceInfo := server.GetServiceInfo()
+	for serviceName, info := range serviceInfo {
+		for _, mInfo := range info.Methods {
+			preRegisterMethod(serviceName, &mInfo)
+		}
+	}
+}
+
+// UnaryServerInterceptor is a gRPC server-side interceptor that provides Prometheus monitoring for Unary RPCs.
+func UnaryServerInterceptor(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (interface{}, error) {
+	monitor := newServerReporter(Unary, info.FullMethod)
+	monitor.ReceivedMessage()
+	resp, err := handler(ctx, req)
+	monitor.Handled(grpc.Code(err))
+	if err == nil {
+		monitor.SentMessage()
+	}
+	return resp, err
+}
+
+// StreamServerInterceptor is a gRPC server-side interceptor that provides Prometheus monitoring for Streaming RPCs.
+func StreamServerInterceptor(srv interface{}, ss grpc.ServerStream, info *grpc.StreamServerInfo, handler grpc.StreamHandler) error {
+	monitor := newServerReporter(streamRpcType(info), info.FullMethod)
+	err := handler(srv, &monitoredServerStream{ss, monitor})
+	monitor.Handled(grpc.Code(err))
+	return err
+}
+
+func streamRpcType(info *grpc.StreamServerInfo) grpcType {
+	if info.IsClientStream && !info.IsServerStream {
+		return ClientStream
+	} else if !info.IsClientStream && info.IsServerStream {
+		return ServerStream
+	}
+	return BidiStream
+}
+
+// monitoredServerStream wraps grpc.ServerStream, allowing each Sent/Recv of a message to increment counters.
+type monitoredServerStream struct {
+	grpc.ServerStream
+	monitor *serverReporter
+}
+
+func (s *monitoredServerStream) SendMsg(m interface{}) error {
+	err := s.ServerStream.SendMsg(m)
+	if err == nil {
+		s.monitor.SentMessage()
+	}
+	return err
+}
+
+func (s *monitoredServerStream) RecvMsg(m interface{}) error {
+	err := s.ServerStream.RecvMsg(m)
+	if err == nil {
+		s.monitor.ReceivedMessage()
+	}
+	return err
+}
diff --git a/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/server_reporter.go b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/server_reporter.go
new file mode 100644
index 0000000..628a890
--- /dev/null
+++ b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/server_reporter.go
@@ -0,0 +1,157 @@
+// Copyright 2016 Michal Witkowski. All Rights Reserved.
+// See LICENSE for licensing terms.
+
+package grpc_prometheus
+
+import (
+	"time"
+
+	"google.golang.org/grpc/codes"
+
+	prom "github.com/prometheus/client_golang/prometheus"
+	"google.golang.org/grpc"
+)
+
+type grpcType string
+
+const (
+	Unary        grpcType = "unary"
+	ClientStream grpcType = "client_stream"
+	ServerStream grpcType = "server_stream"
+	BidiStream   grpcType = "bidi_stream"
+)
+
+var (
+	serverStartedCounter = prom.NewCounterVec(
+		prom.CounterOpts{
+			Namespace: "grpc",
+			Subsystem: "server",
+			Name:      "started_total",
+			Help:      "Total number of RPCs started on the server.",
+		}, []string{"grpc_type", "grpc_service", "grpc_method"})
+
+	serverHandledCounter = prom.NewCounterVec(
+		prom.CounterOpts{
+			Namespace: "grpc",
+			Subsystem: "server",
+			Name:      "handled_total",
+			Help:      "Total number of RPCs completed on the server, regardless of success or failure.",
+		}, []string{"grpc_type", "grpc_service", "grpc_method", "grpc_code"})
+
+	serverStreamMsgReceived = prom.NewCounterVec(
+		prom.CounterOpts{
+			Namespace: "grpc",
+			Subsystem: "server",
+			Name:      "msg_received_total",
+			Help:      "Total number of RPC stream messages received on the server.",
+		}, []string{"grpc_type", "grpc_service", "grpc_method"})
+
+	serverStreamMsgSent = prom.NewCounterVec(
+		prom.CounterOpts{
+			Namespace: "grpc",
+			Subsystem: "server",
+			Name:      "msg_sent_total",
+			Help:      "Total number of gRPC stream messages sent by the server.",
+		}, []string{"grpc_type", "grpc_service", "grpc_method"})
+
+	serverHandledHistogramEnabled = false
+	serverHandledHistogramOpts    = prom.HistogramOpts{
+		Namespace: "grpc",
+		Subsystem: "server",
+		Name:      "handling_seconds",
+		Help:      "Histogram of response latency (seconds) of gRPC that had been application-level handled by the server.",
+		Buckets:   prom.DefBuckets,
+	}
+	serverHandledHistogram *prom.HistogramVec
+)
+
+func init() {
+	prom.MustRegister(serverStartedCounter)
+	prom.MustRegister(serverHandledCounter)
+	prom.MustRegister(serverStreamMsgReceived)
+	prom.MustRegister(serverStreamMsgSent)
+}
+
+// HistogramOption customizes the prom.HistogramOpts used when a handling-time histogram is created.
+type HistogramOption func(*prom.HistogramOpts)
+
+// WithHistogramBuckets allows you to specify custom bucket ranges for histograms if EnableHandlingTimeHistogram is on.
+func WithHistogramBuckets(buckets []float64) HistogramOption {
+	return func(o *prom.HistogramOpts) { o.Buckets = buckets }
+}
+
+// EnableHandlingTimeHistogram turns on recording of handling time of RPCs for server-side interceptors.
+// Histogram metrics can be very expensive for Prometheus to retain and query.
+func EnableHandlingTimeHistogram(opts ...HistogramOption) {
+	for _, o := range opts {
+		o(&serverHandledHistogramOpts)
+	}
+	if !serverHandledHistogramEnabled {
+		serverHandledHistogram = prom.NewHistogramVec(
+			serverHandledHistogramOpts,
+			[]string{"grpc_type", "grpc_service", "grpc_method"},
+		)
+		prom.Register(serverHandledHistogram)
+	}
+	serverHandledHistogramEnabled = true
+}
+
+type serverReporter struct {
+	rpcType     grpcType
+	serviceName string
+	methodName  string
+	startTime   time.Time
+}
+
+func newServerReporter(rpcType grpcType, fullMethod string) *serverReporter {
+	r := &serverReporter{rpcType: rpcType}
+	if serverHandledHistogramEnabled {
+		r.startTime = time.Now()
+	}
+	r.serviceName, r.methodName = splitMethodName(fullMethod)
+	serverStartedCounter.WithLabelValues(string(r.rpcType), r.serviceName, r.methodName).Inc()
+	return r
+}
+
+func (r *serverReporter) ReceivedMessage() {
+	serverStreamMsgReceived.WithLabelValues(string(r.rpcType), r.serviceName, r.methodName).Inc()
+}
+
+func (r *serverReporter) SentMessage() {
+	serverStreamMsgSent.WithLabelValues(string(r.rpcType), r.serviceName, r.methodName).Inc()
+}
+
+func (r *serverReporter) Handled(code codes.Code) {
+	serverHandledCounter.WithLabelValues(string(r.rpcType), r.serviceName, r.methodName, code.String()).Inc()
+	if serverHandledHistogramEnabled {
+		serverHandledHistogram.WithLabelValues(string(r.rpcType), r.serviceName, r.methodName).Observe(time.Since(r.startTime).Seconds())
+	}
+}
+
+// preRegisterMethod is invoked on Register of a Server, allowing all gRPC service labels to be pre-populated.
+func preRegisterMethod(serviceName string, mInfo *grpc.MethodInfo) {
+	methodName := mInfo.Name
+	methodType := string(typeFromMethodInfo(mInfo))
+	// These are just references (no increments): referencing creates the labelled child metrics at zero without setting values.
+	serverStartedCounter.GetMetricWithLabelValues(methodType, serviceName, methodName)
+	serverStreamMsgReceived.GetMetricWithLabelValues(methodType, serviceName, methodName)
+	serverStreamMsgSent.GetMetricWithLabelValues(methodType, serviceName, methodName)
+	if serverHandledHistogramEnabled {
+		serverHandledHistogram.GetMetricWithLabelValues(methodType, serviceName, methodName)
+	}
+	for _, code := range allCodes {
+		serverHandledCounter.GetMetricWithLabelValues(methodType, serviceName, methodName, code.String())
+	}
+}
+
+func typeFromMethodInfo(mInfo *grpc.MethodInfo) grpcType {
+	if !mInfo.IsClientStream && !mInfo.IsServerStream {
+		return Unary
+	}
+	if mInfo.IsClientStream && !mInfo.IsServerStream {
+		return ClientStream
+	}
+	if !mInfo.IsClientStream && mInfo.IsServerStream {
+		return ServerStream
+	}
+	return BidiStream
+}
diff --git a/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/util.go b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/util.go
new file mode 100644
index 0000000..372460a
--- /dev/null
+++ b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/util.go
@@ -0,0 +1,27 @@
+// Copyright 2016 Michal Witkowski. All Rights Reserved.
+// See LICENSE for licensing terms.
+ +package grpc_prometheus + +import ( + "strings" + + "google.golang.org/grpc/codes" +) + +var ( + allCodes = []codes.Code{ + codes.OK, codes.Canceled, codes.Unknown, codes.InvalidArgument, codes.DeadlineExceeded, codes.NotFound, + codes.AlreadyExists, codes.PermissionDenied, codes.Unauthenticated, codes.ResourceExhausted, + codes.FailedPrecondition, codes.Aborted, codes.OutOfRange, codes.Unimplemented, codes.Internal, + codes.Unavailable, codes.DataLoss, + } +) + +func splitMethodName(fullMethodName string) (string, string) { + fullMethodName = strings.TrimPrefix(fullMethodName, "/") // remove leading slash + if i := strings.Index(fullMethodName, "/"); i >= 0 { + return fullMethodName[:i], fullMethodName[i+1:] + } + return "unknown", "unknown" +}
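
The change above wires the interceptors into containerd's gRPC server and pre-registers per-method series via the `metrics-grpc` plugin, but the diff itself does not expose an HTTP scrape endpoint. A hypothetical sketch of serving the default registry, which these interceptors populate, following the pattern the vendored README suggests (the listen address and function names are illustrative, not part of this PR):

```go
package main

import (
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// prometheus.Handler() serves the default registry, which the
	// grpc_prometheus collectors are registered against via init().
	http.Handle("/metrics", prometheus.Handler())
	// Illustrative address only; a real daemon would make this
	// configurable and run it on its own goroutine.
	if err := http.ListenAndServe("127.0.0.1:9090", nil); err != nil {
		panic(err)
	}
}
```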