From 85c0cfb72935ae1e201f3e83c1f245feb75352ff Mon Sep 17 00:00:00 2001
From: Naman Dixit <namandixit@cse.iitb.ac.in>
Date: Mon, 9 Mar 2020 21:08:01 +0530
Subject: [PATCH] Updated docker instrumentation data format

---
 readme.md                                     | 57 ++++++++++++-------
 resource_system/src/grunt/instrument_docker.c | 51 ++++++++++++-----
 2 files changed, 74 insertions(+), 34 deletions(-)

diff --git a/readme.md b/readme.md
index 7c70c30..6d713c7 100644
--- a/readme.md
+++ b/readme.md
@@ -11,8 +11,8 @@ Clone using "git clone --recursive https://git.cse.iitb.ac.in/synerg/xanadu"
 
 ## Architecture
 
-Xanadu is divided into two extremely loosely coupled modules, the **Dispatch System (DS)** and the **Resource System (RS)** module. The RS looks after 
-resource provisioning and consolidation at the host level while the DS looks after handling user requests and executing those requests at the requisite 
+Xanadu is divided into two extremely loosely coupled modules, the **Dispatch System (DS)** and the **Resource System (RS)** module. The RS looks after
+resource provisioning and consolidation at the host level while the DS looks after handling user requests and executing those requests at the requisite
 isolation level using resources provided by the RS. A loose architecture diagram of Xanadu is given below.
 
@@ -35,7 +35,7 @@ Format:
 {
     "resource_id": "unique-transaction-id",
     "timestamp" : "time(2) compatible timestamp",
-    "grunts": [ 
+    "grunts": [
         { node_id: some unique ID, port: port address}, ...
     ] // List of machine IDs
 }
@@ -44,21 +44,40 @@
 Once the runtime entity has been launched (or the launch has failed), the Executor sends back a status message on the `LOG_COMMON` topic.
 ```javascript
 {
-    "message_type" : "deployment_launch",
+    "message_type" : "deployment",
+    "reason": "launch"/"terminate",
     "node_id" : "unique-machine-id",
     "entity_id" : "handle for the actual container/VM/etc.",
     "entity_type" : "docker/libvirt/etc.",
     "resource_id": "logical-entity-id",
     "function_id": "unique-function-id",
     "timestamp" : "time(2) compatible timestamp",
-    "reason": "deployment"/"termination"
-    "status": true/false // Only valid if reason==deployment
 }
 ```
 
-Instrumentation data is also sent on the `LOG_COMMON` topic. This data is sent from whichever part of the pipeline has access to the relevant information, 
+Instrumentation data is also sent on the `LOG_COMMON` topic. This data is sent from whichever part of the pipeline has access to the relevant information,
 and whoever needs the data is allowed to read it. Each message is required to have atleast three fields: `node_id`, `resource_id` and `function_id`.
 ```javascript
+{ // From Docker
+    "message_type" : "instrumentation",
+    "node_id" : "unique-machine-id",
+    "resource_id": "logical-entity-id",
+    "function_id": "unique-function-id",
+    "entity_id" : "handle for the actual container/VM/etc.",
+    "entity_type" : "docker/libvirt/etc.",
+    "timestamp" : "time(2) compatible timestamp",
+    "data" : {
+        "cpu_percentage" : 0.43,
+        "memory_percentage" : 4.32,
+        "memory_used" : "42MB",
+        "memory_usable" : "543MB",
+        "disk_read" : "4GB",
+        "disk_written" : "43GB",
+        "net_upload" : "32MB",
+        "net_download" : "54MB"
+    }
+}
+
 { // Example message from Executor
     "message_type" : "instrumentation",
     "node_id" : "unique-machine-id",
@@ -85,13 +104,13 @@ and whoever needs the data is allowed to read it. Each message is required to ha
     "resource_id": "logical-entity-id",
     "function_id": "unique-function-id",
     "timestamp" : "time(2) compatible timestamp",
-    "coldstart_time" 
+    "coldstart_time"
 }
 ```
 
 ## Dispatch System (DS)
-The DS is divided into two submodules the **Dispatch Manager** and the **Dispatch Daemon**. The Dispatch Manager runs on the Master node while the Dispatch Daemon 
-runs on each Worker nodes. When a request arrives at the Dispatch Manager, it queries the RM for resources and on receiving the resource requests the Dispatch Daemon 
+The DS is divided into two submodules the **Dispatch Manager** and the **Dispatch Daemon**. The Dispatch Manager runs on the Master node while the Dispatch Daemon
+runs on each Worker nodes. When a request arrives at the Dispatch Manager, it queries the RM for resources and on receiving the resource requests the Dispatch Daemon
 to run and execute the function on the specified worker node.
 
 ### Directory Structure
@@ -140,12 +159,12 @@ to run and execute the function on the specified worker node.
 
 #### Dispatch Manager (DM)
 
-Internally DM uses Apache Kafka for interaction between the Dispatch Manager (DM) and the Dispatch Agents, while the messages are in JSON format. 
+Internally DM uses Apache Kafka for interaction between the Dispatch Manager (DM) and the Dispatch Agents, while the messages are in JSON format.
 
-Every Dispatch Agent listens on a topic which is its own UID (Currently the primary IP Address), the Dispatch Manager listens on the topics *"response"* and 
+Every Dispatch Agent listens on a topic which is its own UID (Currently the primary IP Address), the Dispatch Manager listens on the topics *"response"* and
 *"heartbeat"*.
 
-- **Request Message:** When a request is received at the Dispatch Manager, it directs the Dispatch Agent to start a worker environment. A message is sent via the 
+- **Request Message:** When a request is received at the Dispatch Manager, it directs the Dispatch Agent to start a worker environment. A message is sent via the
 - chose Worker's ID topic. \
   Format:
@@ -223,7 +242,7 @@ Upon being launched, each Resource Daemon (RD) sends a JOIN message to the RM on
 }
 ```
 
-After this, RDs send a heartbeat message to the RM periodically on topic `HEARTBEAT_RD_2_RM`. These messages contain the current state of all the 
+After this, RDs send a heartbeat message to the RM periodically on topic `HEARTBEAT_RD_2_RM`. These messages contain the current state of all the
 resources being tracked by RDs on each machine. This data is cached by the RM.
 
 ```javascript
@@ -235,7 +254,7 @@ resources being tracked by RDs on each machine. This data is cached by the RM.
 }
 ```
 
-If the RM recieves a heartbeat from an RD which has not joined before (due to either the RM dying and some older messages being stuck in Kafka), 
+If the RM recieves a heartbeat from an RD which has not joined before (due to either the RM dying and some older messages being stuck in Kafka),
 it sends a rejoin command to the RD on topic `REJOIN_RM_2_RD`.
 ```javascript
 {
@@ -244,10 +263,10 @@ it sends a rejoin command to the RD on topic `REJOIN_RM_2_RD`.
 }
 ```
 
-Also, if the RM doesn't recieve heartbeats from some RD for some amount of time, it assumes that the RD is dead and frees its resources. 
+Also, if the RM doesn't recieve heartbeats from some RD for some amount of time, it assumes that the RD is dead and frees its resources.
 If the RD sends a beat after this, the rejoin message is sent.
 
-The RM, upon recieving the request from the DM, checks its local cache to find a suitable machine. If it finds some, it sends a message back to the 
+The RM, upon recieving the request from the DM, checks its local cache to find a suitable machine. If it finds some, it sends a message back to the
 DM on topic `RESPONSE_RM_2_DM`.
 ```javascript
 {
@@ -258,7 +277,7 @@ DM on topic `RESPONSE_RM_2_DM`.
 }
 ```
 
-If, on the other hand, the RM can't find any such machine in its cache, it sends a message to all the RDs requesting their current status. This message 
+If, on the other hand, the RM can't find any such machine in its cache, it sends a message to all the RDs requesting their current status. This message
 is posted on the topic `REQUEST_RM_2_RD`.
 Format:
 ```javascript
@@ -280,5 +299,5 @@ The RDs recieve this message and send back whether on not they satisfy the const
 }
 ```
 
-The RM waits for a certain amount of time for the RDs; then, it sends a list of however many RDs have replied affirmatively to the DM on topic 
+The RM waits for a certain amount of time for the RDs; then, it sends a list of however many RDs have replied affirmatively to the DM on topic
 `RESPONSE_RM_2_DM`, as described above.

diff --git a/resource_system/src/grunt/instrument_docker.c b/resource_system/src/grunt/instrument_docker.c
index bd76d4f..f74df6a 100644
--- a/resource_system/src/grunt/instrument_docker.c
+++ b/resource_system/src/grunt/instrument_docker.c
@@ -30,28 +30,49 @@ void* dockerProcessLoop (void *arg)
                       atof(cJSON_GetObjectItem(data_json, "CPUPerc")->valuestring));
             sbufPrint(json, ",\n\"memory_percentage\": %f",
                       atof(cJSON_GetObjectItem(data_json, "MemPerc")->valuestring));
-            sbufPrint(json, ",\n\"memory_used\": %f",
-                      atof(cJSON_GetObjectItem(data_json, "MemUsage")->valuestring));
-            sbufPrint(json, ",\n\"memory_limit\": %f",
-                      atof(strchr(cJSON_GetObjectItem(data_json, "MemUsage")->valuestring, '/') + 1));
-            sbufPrint(json, ",\n\"block_input\": %f",
-                      atof(cJSON_GetObjectItem(data_json, "BlockIO")->valuestring));
-            sbufPrint(json, ",\n\"block_output\": %f",
-                      atof(strchr(cJSON_GetObjectItem(data_json, "BlockIO")->valuestring, '/') + 1));
-            sbufPrint(json, ",\n\"net_down\": %f",
-                      atof(cJSON_GetObjectItem(data_json, "NetIO")->valuestring));
-            sbufPrint(json, ",\n\"net_up\": %f",
-                      atof(strchr(cJSON_GetObjectItem(data_json, "NetIO")->valuestring, '/') + 1));
+
+            { // Memory
+                Char *mem_begin = cJSON_GetObjectItem(data_json, "MemUsage")->valuestring;
+                Char *mem_middle = strchr(mem_begin, '/');
+                mem_middle[0] = '\0';
+                mem_middle[-1] = '\0';
+                mem_middle[1] = '\0';
+                sbufPrint(json, ",\n\"memory_used\": \"%s\"", mem_begin);
+                sbufPrint(json, ",\n\"memory_usable\": \"%s\"", mem_middle + 2);
+            }
+
+            { // Disk
+                Char *block_begin = cJSON_GetObjectItem(data_json, "BlockIO")->valuestring;
+                Char *block_middle = strchr(block_begin, '/');
+                block_middle[0] = '\0';
+                block_middle[-1] = '\0';
+                block_middle[1] = '\0';
+                sbufPrint(json, ",\n\"disk_read\": \"%s\"", block_begin);
+                sbufPrint(json, ",\n\"disk_written\": \"%s\"", block_middle + 2);
+            }
+
+            { // Network
+                Char *net_begin = cJSON_GetObjectItem(data_json, "NetIO")->valuestring;
+                Char *net_middle = strchr(net_begin, '/');
+                net_middle[0] = '\0';
+                net_middle[-1] = '\0';
+                net_middle[1] = '\0';
+                sbufPrint(json, ",\n\"net_upload\": \"%s\"", net_begin);
+                sbufPrint(json, ",\n\"net_download\": \"%s\"", net_middle + 2);
+            }
+
             sbufPrint(json, "\n}\n");
 
             cJSON *json_parse = cJSON_Parse(json);
             Char *json_pretty = cJSON_Print(json_parse);
+            Sint time_now = (Sint)time(0);
 
             Char *output = NULL;
-            sbufPrint(output, "{\"node_id\": \"%s\"", node_name);
+            sbufPrint(output, "{\n\"message_type\": \"%s\"", "instrumentation");
+            sbufPrint(output, ",\"node_id\": \"%s\"", node_name);
             sbufPrint(output, ",\n\"entity_id\": \"%s\"", (Char*)arg);
-            sbufPrint(output, ",\n\"type\": \"%s\"", "instrumentation");
-            sbufPrint(output, ",\n\"backend\": \"%s\"", "docker");
+            sbufPrint(output, ",\n\"entity_type\": \"%s\"", "docker");
+            sbufPrint(output, ",\n\"timestamp\": %d", time_now);
             sbufPrint(output, ",\n\"data\": %s", json_pretty);
             sbufPrint(output, "\n}\n");
-- 
2.24.1
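
For reference, the Memory, Disk, and Network blocks added above all do the same thing: they take a `docker stats` field of the form `"42MB / 543MB"`, cut it at the `/`, and emit the two halves as separate JSON strings. Below is a minimal standalone sketch of that splitting step in plain C, assuming the single-space padding that `docker stats` prints around the slash; the `split_usage_pair` helper and the sample values are illustrative and not part of the patch.

```c
/* Illustrative sketch only (not from the patch): split a docker-stats style
 * "used / limit" field into its two halves, as the Memory/Disk/Network
 * blocks above do with strchr() and in-place NUL termination. */
#include <stdio.h>
#include <string.h>

/* Copies the text before '/' into `left` and the text after it into `right`,
 * trimming the single spaces around the separator. Returns 0 on success,
 * -1 if no '/' is present. */
static int split_usage_pair(const char *field,
                            char *left, size_t left_size,
                            char *right, size_t right_size)
{
    const char *slash = strchr(field, '/');
    if (slash == NULL) {
        return -1;
    }

    size_t left_len = (size_t)(slash - field);
    while (left_len > 0 && field[left_len - 1] == ' ') {
        left_len--;                      /* drop the space before '/' */
    }
    if (left_len >= left_size) {
        left_len = left_size - 1;        /* clamp to the output buffer */
    }
    memcpy(left, field, left_len);
    left[left_len] = '\0';

    const char *rest = slash + 1;
    while (*rest == ' ') {
        rest++;                          /* skip the space after '/' */
    }
    snprintf(right, right_size, "%s", rest);
    return 0;
}

int main(void)
{
    char used[32], usable[32];
    if (split_usage_pair("42MB / 543MB", used, sizeof used,
                         usable, sizeof usable) == 0) {
        /* Prints: memory_used="42MB" memory_usable="543MB" */
        printf("memory_used=\"%s\" memory_usable=\"%s\"\n", used, usable);
    }
    return 0;
}
```

Unlike the in-place writes in the patch (`mem_middle[-1] = '\0';` and friends), this sketch copies into caller-provided buffers and tolerates a missing separator, which is one way to keep the parsing step independent of the lifetime of cJSON's `valuestring`.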