
DataKit Master Configuration


The DataKit master configuration controls the running behavior of DataKit itself.

The configuration file is typically located at:

  • Linux/Mac: /usr/local/datakit/conf.d/datakit.conf
  • Windows: C:\Program Files\datakit\conf.d\datakit.conf

When DataKit is installed as a DaemonSet, this file still exists in the corresponding directory, but DataKit does not actually load it; the effective configuration is generated by injecting environment variables. For every configuration item below, you can find the corresponding environment variable in the Kubernetes deployment documentation.

Datakit Main Configuration Sample

The Datakit main configuration file is datakit.conf. Here is a sample (1.65.1):

datakit.conf
################################################
# Global configures
################################################
# Default enabled input list.
default_enabled_inputs = [
  "cpu",
  "disk",
  "diskio",
  "host_processes",
  "hostobject",
  "mem",
  "net",
  "swap",
  "system",
]

# enable_pprof: bool
# If pprof is enabled, we can profile the running datakit
enable_pprof = true
pprof_listen = "localhost:6060" # pprof listen

# protect_mode: bool, default false
# When protect_mode is enabled, we can set radical collect parameters; these may cause Datakit to
# collect data more frequently.
protect_mode = true

# The user name running datakit. Generally for audit purpose. Default is root.
datakit_user = "root"

################################################
# ulimit: set max open-files limit(Linux only)
################################################
ulimit = 64000

################################################
# point_pool: use point pool for better memory usage
################################################
[point_pool]
  enable = false
  reserved_capacity = 4096

################################################
# DCA configure
################################################
[dca]
  # Enable or disable DCA
  enable = false

  # DCA websocket server address
  websocket_server = "ws://localhost:8000/ws"

################################################
# Upgrader 
################################################
[dk_upgrader]
  # host address
  host = "0.0.0.0"

  # port number
  port = 9542 

################################################
# Pipeline
################################################
[pipeline]
  # IP database type, support iploc and geolite2
  ipdb_type = "iploc"

  # How often to sync remote pipeline
  remote_pull_interval = "1m"

  #
  # reftab configures
  #
  # Reftab remote HTTP URL(https/http)
  refer_table_url = ""

  # How often reftab sync the remote
  refer_table_pull_interval = "5m"

  # use sqlite to store reftab data to reduce memory usage
  use_sqlite = false
  # or use pure memory to cache the reftab data
  sqlite_mem_mode = false

  # append run info
  disable_append_run_info = false

  # default pipeline
  [pipeline.default_pipeline]
    # logging = "<your_script.p>"
    # metric  = "<your_script.p>"
    # tracing = "<your_script.p>"

  # Offload data processing tasks to post-level data processors.
  [pipeline.offload]
    receiver = "datakit-http"
    addresses = [
      # "http://<ip>:<port>"
    ]

################################################
# HTTP server(9529)
################################################
[http_api]

  # HTTP server address
  listen = "localhost:9529"

  # Disable 404 page to hide detailed Datakit info
  disable_404page = false

  # only enable these APIs. If the list is empty, all APIs are enabled.
  public_apis = []

  # Datakit server-side timeout
  timeout = "30s"
  close_idle_connection = false

  # API rate limit(QPS)
  request_rate_limit = 20.0

  #
  # RUM related: we should port these configures to RUM inputs(TODO)
  #
  # When serving RUM(/v1/write/rum), extract the IP address from this HTTP header
  rum_origin_ip_header = "X-Forwarded-For"
  # When serving RUM(/v1/write/rum), only accept requests from these app-id.
  # If the list is empty, all apps' requests are accepted.
  rum_app_id_white_list = []

  # only these domains are allowed for CORS. If the list is empty, all domains are allowed.
  allowed_cors_origins = []

  # Start Datakit web server with HTTPS
  [http_api.tls]
    # cert = "path/to/certificate/file"
    # privkey = "path/to/private_key/file"

################################################
# io configures
################################################
[io]
  # How often Datakit flushes data to Dataway.
  # Datakit will upload data points if the cached (in-memory) points
  #  reach (>=) max_cache_count or the flush_interval is triggered.
  max_cache_count = 1000
  flush_workers   = 0 # default to (cpu_core * 2)
  flush_interval  = "10s"

  # Queue size of feed.
  feed_chan_size = 1

  # Set blocking if queue is full.
  # NOTE: Global blocking mode may consume more memory on large metric points.
  global_blocking = false

  # Data point filter configures.
  # NOTE: Most of the time you should use the web-side filter; this is a debug helper for developers.
  #[io.filters]
  #  logging = [
  #   "{ source = 'datakit' or f1 IN [ 1, 2, 3] }"
  #  ]
  #  metric = [
  #    "{ measurement IN ['datakit', 'disk'] }",
  #    "{ measurement CONTAIN ['host.*', 'swap'] }",
  #  ]
  #  object = [
  #    { class CONTAIN ['host_.*'] }",
  #  ]
  #  tracing = [
  #    "{ service = re("abc.*") AND some_tag CONTAIN ['def_.*'] }",
  #  ]

[recorder]
  enabled = false
  #path = "/path/to/point-data/dir"
  encoding = "v2"  # use protobuf-json format
  duration = "30m" # record for 30 minutes

  # only record these inputs, if empty, record all
  inputs = [
    #"cpu",
    #"mem",
  ]

  # only record these categories, if empty, record all
  category = [
    #"logging",
    #"object",
  ]

################################################
# Dataway configure
################################################
[dataway]
  # urls: Dataway URL list
  # NOTE: do not configure multiple URLs here, it's a deprecated feature.
  urls = [
    # "https://openway.guance.com?token=<YOUR-WORKSPACE-TOKEN>"
  ]

  # Dataway HTTP timeout
  timeout_v2 = "30s"

  # max_retry_count specifies at most how many times a request is retried when the Dataway API fails (non-4xx).
  # The default (and minimum) value is 1 and the maximum value is 10.
  #
  # The default is set to 1 to make the API fail ASAP and release memory.
  max_retry_count = 1

  # The interval between two retry operation, valid time units are "ns", "us", "ms", "s", "m", "h"
  retry_delay = "1s"

  # HTTP Proxy
  # Format: "http(s)://IP:Port"
  http_proxy = ""

  max_idle_conns   = 0       # limit idle TCP connections for HTTP request to Dataway
  enable_httptrace = false   # enable trace HTTP metrics(connection/DNS/TLS and so on)
  idle_timeout     = "90s"   # not-set, default 90s

  # HTTP body content type, other candidates are(case insensitive):
  #  - v1: line-protocol
  #  - v2: protobuf
  content_encoding = "v2"

  # Enable GZip to upload point data.
  #
  # do NOT disable gzip, or you will get a large network payload.
  gzip = true

  max_raw_body_size = 1048576 # max body size(before gzip) in bytes

  # Custom tag or field keys that will be extracted from existing points
  # to build the X-Global-Tags HTTP header value.
  global_customer_keys = []
  enable_sinker        = false # disable sinker

  # use dataway as NTP server
  [dataway.ntp]
    interval = "5m"  # sync dataway time each 5min

    # If the absolute difference between datakit's local time and dataway's time
    # reaches diff, datakit's soft time will be updated to the dataway time.
    # NOTE: diff MUST be larger than "1s"
    diff     = "30s" 

  # WAL queue for uploading points
  [dataway.wal]
    max_capacity_gb = 2.0 # 2GB reserved disk space for each category(M/L/O/T/...)
    #workers = 4          # flush workers on WAL(default to CPU limited cores)
    #mem_cap = 4          # in-memory queue capacity(default to CPU limited cores)
    #fail_cache_clean_interval = "30s" # duration for clean fail uploaded data


################################################
# Datakit logging configure
################################################
[logging]

  # log path
  log = "/var/log/datakit/log"

  # HTTP access log
  gin_log = "/var/log/datakit/gin.log"

  # log level(info/debug)
  level = "info"

  # Disable log color
  disable_color = false

  # log rotate size(in MB)
  # DataKit will always keep at most n+1 (n backup logs and 1 active log) split log files on disk.
  rotate = 32

  # Upper limit count of backup log
  rotate_backups = 5

################################################
# Global tags
################################################
# We will try to add these tags to every collected data point if these
# tags do not exist in the original data.
#
# NOTE: to get the real IP of the current node, we just need
# to set "$datakit_ip" or "__datakit_ip" here. Same for the hostname.
[global_host_tags]
  ip   = "$datakit_ip"
  host = "$datakit_hostname"

[election]
  # Enable election
  enable = false

  # Election whitelist
  # NOTE: Empty to disable whitelist
  node_whitelist = []

  # Election namespace.
  # NOTE: for a single workspace, there can be multiple election namespaces.
  namespace = "default"

  # If enabled, every data point will add a tag with election_namespace = <your-election-namespace>
  enable_namespace_tag = false

  # Like global_host_tags, but only for data points that are remotely collected(such as MySQL/Nginx).
  [election.tags]
    #  project = "my-project"
    #  cluster = "my-cluster"

###################################################
# Tricky: we can rename the default hostname here
###################################################
[environments]
  ENV_HOSTNAME = ""

################################################
# resource limit configures
################################################
[resource_limit]

  # enable or disable resource limit
  enable = true

  # Linux only, cgroup path
  path = "/datakit"

  # set max CPU usage(%, max 100.0, no matter how many CPU cores here)
  cpu_max = 30.0

  # set max memory usage(MB)
  mem_max_mb = 4096

################################################
# git_repos configures
################################################

# We can host all input configurations on a git server
[git_repos]
  # git pull interval
  pull_interval = "1m"

  # git repository settings
  [[git_repos.repo]]
    # enable the repository or not
    enable = false

    # the branch name to pull
    branch = "master"

    # git repository URL. There are 3 formats here:
    #   - HTTP(s): such as "https://github.datakit.com/path/to/datakit-conf.git"
    #   - Git: such as "git@github.com:path/to/datakit.git"
    #   - SSH: such as "ssh://git@github.com:9000/path/to/repository.git"
    url = ""

    # For formats Git and SSH, we need extra configures:
    ssh_private_key_path = ""
    ssh_private_key_password = ""

################################################
# crypto key or key filePath.
################################################
[crypto]
  aes_key = ""
  aes_Key_file = ""

[remote_job]
  enable=false
  envs = ["OSS_BUCKET_HOST=host","OSS_ACCESS_KEY_ID=key","OSS_ACCESS_KEY_SECRET=secret","OSS_BUCKET_NAME=bucket"]
  interval = "30s"
  java_home=""

Configuration of HTTP Service

DataKit opens an HTTP service to receive external data or provide basic data services to the outside world.

Modify the HTTP Service Address

The default HTTP service address is localhost:9529. If port 9529 is occupied, or if you want to access DataKit's HTTP service from outside (for example, to receive RUM or Tracing data), you can modify it as follows:

[http_api]
   listen = "0.0.0.0:<other-port>"
   # or using IPV6 address
   # listen = "[::]:<other-port>"

NOTE: IPv6 support requires Datakit 1.5.7 or later.
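
After changing listen and restarting Datakit, a quick connectivity check from another machine could look like the following sketch; the address and port are placeholders to be replaced with your own values, and /v1/ping is the ping API mentioned below:

# Ping the Datakit HTTP API on the new address/port
curl -s http://<datakit-ip>:<other-port>/v1/ping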

Using Unix Domain Socket

Datakit supports access over a UNIX domain socket. To enable it, set the listen field to the full path of a file that does not yet exist; datakit.sock is used here as an example, but any file name will do.

[http_api]
   listen = "/tmp/datakit.sock"

After the configuration is complete, you can use the curl command to test whether it works:

sudo curl --no-buffer -XGET --unix-socket /tmp/datakit.sock http://localhost/v1/ping

For more information on curl test commands, see here.

HTTP Request Frequency Control

Since version 1.62.0, this limit is enabled by default.

As DataKit needs to receive a large number of external data writes, to avoid causing excessive overhead on the host node, you can adjust the following HTTP configuration:

[http_api]
  request_rate_limit = 20.0 # Limit HTTP request(client IP + route) QPS
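
A rough way to observe the limiter is to fire a burst of requests at one route and count the response codes. This is only a sketch; the exact status code returned when the limit is hit may vary by Datakit version (429 Too Many Requests is typical for rate limiters):

# Send 50 quick requests to the ping API and tally the HTTP status codes
for i in $(seq 1 50); do
  curl -s -o /dev/null -w "%{http_code}\n" http://localhost:9529/v1/ping
done | sort | uniq -c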

Other Settings

[http_api]
    close_idle_connection = true # Close idle connections
    timeout = "30s"              # Set server-side HTTP timeout

See here.

HTTP API Whitelist

Version-1.64.0

For security reasons, Datakit defaults to restricting access to some of its own APIs, which can only be accessed via localhost. If Datakit is deployed in a public network environment and there is a need to request these APIs over the public network (or from other machines in the local LAN), you can modify the following public_apis field configuration in the datakit.conf:

[http_api]
  public_apis = [
    # Allow access to the /metrics endpoint
    "/metrics",
  ]

By default, only the Ping interface and basic data upload interfaces are accessible from external sources, while all other interfaces are prohibited from external access. For collector-specific interfaces, such as those for trace collectors, they are accessible externally by default once the collector is enabled. For instructions on adding API whitelists in Kubernetes, refer to this section.
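
Once /metrics is whitelisted, a quick check from another machine might look like this (assuming listen is set to 0.0.0.0:9529; <datakit-ip> is a placeholder):

# Fetch the first lines of Datakit's self-metrics endpoint from a remote host
curl -s http://<datakit-ip>:9529/metrics | head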

Global Tag Modification

Version-1.4.6

Datakit allows you to configure global tags for all collected data. Global tags are divided into two categories:

  • Global Host Tags (GHT): the collected data is bound to the current host, such as CPU/memory metrics.
  • Global Election Tags (GET): the collected data comes from a common (remote) entity, such as MySQL/Redis, which generally participates in elections; therefore, such data will not carry tags related to the current host.

[global_host_tags] # These are referred to as 'Global Host Tags'
  ip   = "__datakit_ip"
  host = "__datakit_hostname"

[election]
  [election.tags] # These are referred to as 'Global Election Tags'
    project = "my-project"
    cluster = "my-cluster"

When adding global tags, there are several points to note:

  1. The values of these global tags can use several wildcards currently supported by Datakit (both the double underscore (__) prefix and $ are acceptable):

    1. __datakit_ip/$datakit_ip: The tag value will be set to the first primary network card IP obtained by DataKit.
    2. __datakit_hostname/$datakit_hostname: The tag value will be set to the hostname of DataKit.
  2. Due to DataKit data transmission protocol restrictions, do not include any metric (field) keys in the global tags, as this would violate the protocol and cause data processing to fail. For details, refer to the field list of the specific collector. Also, do not add too many tags, and the length of each tag's key and value is limited.

  3. If the collected data already contains a tag with the same name, DataKit will not append the configured global tag.
  4. Even if no tag is configured in GHT, Datakit will still add a host=__datakit_hostname tag to GHT, because host is currently the default field for correlating data. Therefore, all collected data, including logs and CPU/memory metrics, will carry the host tag.
  5. These two types of global tags (GHT/GET) can intersect, such as setting a tag of project = "my-project" in both.
  6. When no election is enabled, GET follows all tags in GHT (which has at least a host tag).
  7. Election-based collectors default to appending GET, and non-election-based collectors default to appending GHT.
How to distinguish between election and non-election collectors?

At the top of each collector's documentation there is a set of identifiers indicating the platform adaptation and collection characteristics of that collector. If these identifiers include the election mark, the collector is an election-based collector.

Settings of Global Tag in Remote Collection

By default, DataKit appends the tag host=<hostname of the machine where DataKit runs> to all collected data, but this default host tag can cause trouble in some cases.

Take MySQL as an example: if MySQL is not on the DataKit machine, you may want the host tag to be the real hostname of the collected MySQL instance (or another identifying field of the cloud database), rather than the hostname of DataKit.

In this case, we can bypass the global tag on DataKit:

  • In the specific collector's configuration there is generally a tags section where we can add tags. If we don't want DataKit to add the default host=xxx tag, we can override it there. Taking MySQL as an example:

[inputs.mysql.tags]
  host = "real-mysql-host-name"
Tip

Starting from version 1.4.20, DataKit defaults to using the IP/Host from the connection address of the collected service as the value for the host tag.

DataKit Own Running Log Configuration

DataKit has two logs of its own: the run log (/var/log/datakit/log) and the HTTP access log (/var/log/datakit/gin.log).

The default logging level for DataKit is info. Edit datakit.conf to modify the log level and slice size:

[logging]
  level = "debug" # correct info to debug
  rotate = 32     # each log slice is 32MB
  • level: when set to debug, you can see more logs (currently only the debug/info levels are supported).
  • rotate: DataKit rotates its logs by default. The default slice size is 32MB, and there are 6 slices in total (1 active slice plus 5 rotated slices; the number of slices is not yet configurable). If DataKit logs take up too much disk space (at most 32 x 6 = 192MB), reduce the rotate size (for example, change it to 4, in MB). HTTP access logs are rotated in the same way.
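
To see how much disk space these logs currently occupy, a simple check on the paths configured above is:

# Show the size of Datakit's run log and HTTP access log
ls -lh /var/log/datakit/
du -sh /var/log/datakit/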

Advanced Configuration

The following content involves some advanced configuration. If you are not sure about the configuration, it is recommended to consult our technical experts.

Point Pool

Version-1.28.0 · Experimental

Point pool has proven to perform poorly; do not enable it in production.

To optimize Datakit's memory usage under high load conditions, we can enable Point Pool to alleviate the pressure:

# datakit.conf
[point_pool]
    enable = true
    reserved_capacity = 4096

We can also set content_encoding = "v2" in the Dataway configuration ( Version-1.32.0 enables v2 by default); v2 encoding has lower memory and CPU overhead than v1.

Attention

While Datakit is under low load (with a small memory footprint), enabling Point-Pool will use more memory (extra memory is needed to cache unused point structures), though not excessively. "High load" typically refers to scenarios where memory consumption reaches 2GB or more. Enabling Point-Pool not only helps memory usage but also improves Datakit's CPU consumption.

IO Module Parameter Adjustment

Version-1.4.8 · Experimental

In some cases, the volume of data collected by DataKit is very large. If network bandwidth is limited, some data collection may be interrupted or discarded. You can mitigate this problem by configuring some parameters of the io module:

[io]
  feed_chan_size  = 1     # length of the compact (feed) queue
  max_cache_count = 1000  # flush threshold: points are uploaded once the in-memory cache reaches this count
  flush_interval  = "10s" # flush data at least once every 10s
  flush_workers   = 0     # upload workers, default is (limited CPU cores * 2)

See the corresponding description for blocking mode in Kubernetes

See here

Resource Limit

Because the amount of data processed by DataKit cannot be estimated in advance, DataKit may consume a large amount of resources on its node if it is not physically limited. We can limit it with the help of cgroup on Linux or job objects on Windows, using the following configuration in datakit.conf:

[resource_limit]
  path = "/datakit" # Linux cgroup restricts directories, such as /sys/fs/cgroup/memory/datakit, /sys/fs/cgroup/cpu/datakit

  # Maximum CPU utilization allowed (percentile)
  cpu_max = 20.0

  # Allows 4GB of memory (memory + swap) by default
  # If set to 0 or negative, memory limits are not enabled
  mem_max_mb = 4096 

If DataKit exceeds the memory limit, it will be forcibly killed by the operating system. You can see the following result with the command below; in this case the service has to be started manually.

$ systemctl status datakit 
 datakit.service - Collects data and upload it to DataFlux.
     Loaded: loaded (/etc/systemd/system/datakit.service; enabled; vendor preset: enabled)
     Active: activating (auto-restart) (Result: signal) since Fri 2022-02-30 16:39:25 CST; 1min 40s ago
    Process: 3474282 ExecStart=/usr/local/datakit/datakit (code=killed, signal=KILL)
   Main PID: 3474282 (code=killed, signal=KILL)
Attention
  • Resource restriction is only turned on by default during host installation.
  • Resource limiting only supports CPU usage and memory usage (mem + swap) controls, and only supports Linux and Windows ( Version-1.15.0) operating systems.
  • CPU usage control is not supported on these Windows systems: Windows 7, Windows Server 2008 R2, Windows Server 2008, Windows Vista, Windows Server 2003 and Windows XP.
  • When adjusting the resource limit as a non-root user, the service must be reinstalled.
  • The CPU core count directly influences the number of worker threads in some Datakit submodules. These workers, which handle tasks such as data uploads, are typically set to a multiple of the total CPU cores; for instance, the data-upload workers commonly default to twice the number of CPU cores. Since each upload worker reserves about 10MB of memory for data transmission by default, a large number of CPU cores can significantly increase Datakit's overall memory footprint.
Tip

Datakit supports cgroup V2 from version 1.5.8. If you are unsure of the cgroup version, you can use this command mount | grep cgroup to check.
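
To confirm the limits actually applied on a Linux host, you can inspect the cgroup that Datakit creates. The paths below follow the path = "/datakit" setting shown above; the exact files differ between cgroup v1 and v2, so treat this as a sketch:

# Which cgroup version is in use
mount | grep cgroup

# cgroup v1: memory and CPU limits live under separate controllers
cat /sys/fs/cgroup/memory/datakit/memory.limit_in_bytes 2>/dev/null
cat /sys/fs/cgroup/cpu/datakit/cpu.cfs_quota_us 2>/dev/null

# cgroup v2: unified hierarchy
cat /sys/fs/cgroup/datakit/memory.max 2>/dev/null
cat /sys/fs/cgroup/datakit/cpu.max 2>/dev/null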

Datakit Usage Metering Standards

Version-1.29.0

To standardize the statistical measurement of Datakit usage, the following clarification is provided for the logical measurement method of Datakit:

  • If none of the following collectors are enabled, then the logical measurement count for Datakit is 1.
  • If the runtime of Datakit (with no more than a 30-minute interruption) exceeds 12 hours, it is counted for metering; otherwise, it is not counted.
  • For the following enabled collectors, the measurement is based on the currently configured number of CPU cores of Datakit, with a minimum value of 1 and a maximum value equal to the number of physical CPU cores[1], rounding up any fractional part:

With these rules, it is possible to more accurately reflect the actual usage of Datakit, providing users with a more transparent and fair billing method.

Election Configuration

See here

Dataway Settings

Dataway has the following settings that can be configured:

  • timeout: the timeout for requests to Dataway. The default value is 30s
  • max_retry_count: sets the number of retries for requests to Dataway (1 by default, maximum 10) Version-1.17.0
  • retry_delay: sets the base step of the retry interval. The default value is 200ms; the base step means 200ms for the first retry, 400ms for the second, 800ms for the third, and so on (growing as $2^n$) Version-1.17.0
  • max_raw_body_size: sets the maximum size of a single uploaded package (before compression), in bytes Version-1.17.1
  • content_encoding: v1 or v2 can be selected Version-1.17.1
    • v1 is line protocol (the default before version 1.32.0)
    • v2 is the Protobuf protocol; compared with v1, it performs better in all respects

See here for configuration under Kubernetes.

WAL Queue Configuration

Version-1.60.0

In the [dataway.wal] section, we can adjust the configuration of the WAL queue:

  [dataway.wal]
     max_capacity_gb = 2.0             # 2GB reserved disk space for each category (M/L/O/T/...)
     workers = 0                       # flush workers on WAL (default to CPU limited cores)
     mem_cap = 0                       # in-memory queue capacity (default to CPU limited cores)
     fail_cache_clean_interval = "30s" # duration for cleaning failed uploaded data

The disk files are located in the cache/dw-wal directory under the Datakit installation directory:

/usr/local/datakit/cache/dw-wal/
├── custom_object
│   └── data
├── dialtesting
│   └── data
├── dynamic_dw
│   └── data
├── fc
│   └── data
├── keyevent
│   └── data
├── logging
│   ├── data
│   └── data.00000000000000000000000000000000
├── metric
│   └── data
├── network
│   └── data
├── object
│   └── data
├── profiling
│   └── data
├── rum
│   └── data
├── security
│   └── data
└── tracing
    └── data

13 directories, 14 files

Here, except for the fc directory, which is the failure-retry queue, each directory corresponds to a data category. When data upload fails, the data is cached in the fc directory, and Datakit will periodically retry uploading it later.
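
To keep an eye on how much disk the WAL queues use, and whether failed uploads are piling up, a simple check is:

# Per-category WAL disk usage; a growing fc/ directory indicates repeated upload failures
du -sh /usr/local/datakit/cache/dw-wal/*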

Dataway Sinker

See here

Managing DataKit Configuration with Git

Because the configurations of the various DataKit collectors are plain text files, modifying them and making them take effect one by one takes a lot of effort. Instead, we can use Git to manage these configurations, with the following advantages:

  • Automatically synchronize the latest configuration from the remote Git repository and take effect automatically.
  • Git has its own version management, which can effectively track the change history of various configurations.

When you install DataKit (both DaemonSet and host installation are supported), you can specify the Git configuration repository.

Manually Configure Git Administration

Datakit supports the use of git to manage collector configurations, Pipeline, and Python scripts. In datakit.conf, find the git_repos location and edit the following:

[git_repos]
  pull_interval = "1m" # Synchronize configuration interval, that is, synchronize once every 1 minute

  [[git_repos.repo]]
    enable = false   # Do not enable the repo

    ###########################################
    # Three protocols supported by Git address: http/git/ssh
    ###########################################
    url = "http://username:password@github.com/path/to/repository.git"

    # The following two protocols (git/ssh) need to be configured with key-path and key-password
    # url = "git@github.com:path/to/repository.git"
    # url = "ssh://git@github.com:9000/path/to/repository.git"
    # ssh_private_key_path = "/Users/username/.ssh/id_rsa"
    # ssh_private_key_password = "<YOUR-PASSWORD>"

    branch = "master" # Specify git branch

Note: After Git synchronization is turned on, the collector configurations in the original conf.d directory will no longer take effect (except datakit.conf).

Applying Git-managed Pipeline Sample

We can add a Pipeline to a collector configuration to process the logs of related services. When Git synchronization is turned on, both the Pipelines that ship with DataKit and the Pipelines synchronized via Git can be used. The following is a Pipeline configuration example for the Nginx collector:

[[inputs.nginx]]
    ...
    [inputs.nginx.log]
    ...
    pipeline = "my-nginx.p" # Where to load my-nginx.p, see the "constraint" description below

Git-managed Usage Constraints

The following constraints must be followed when using Git:

  • Create a conf.d folder in the git repo and place the DataKit collector configurations under it
  • Create a pipeline folder in the git repo and place the Pipeline files under it
  • Create a python.d folder in the git repo and place the Python script files under it

This is illustrated below:

datakit root directory
├── conf.d
├── data
├── pipeline   # top-level Pipeline scripts
├── python.d   # top-level python.d scripts
├── externals
└── gitrepos
    ├── repo-1                 # repository 1
    │   ├── conf.d             # dedicated to collector configurations
    │   ├── pipeline           # dedicated to Pipeline scripts
    │   │   ├── my-nginx.p     # valid Pipeline script
    │   │   └── 123            # invalid Pipeline subdirectory; files under it will not take effect
    │   │       └── some-invalid.p
    │   └── python.d           # python.d scripts
    │       └── core
    └── repo-2                 # repository 2
        ├── ...
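
A minimal sketch of preparing a conforming repository follows; the repository URL reuses the example from above, the file paths are placeholders, and the collector configuration is assumed to come from your existing conf.d:

# Create the three directories Datakit expects, add configs/Pipelines, and push
git init datakit-conf && cd datakit-conf
mkdir -p conf.d pipeline python.d
cp /path/to/your/nginx.conf conf.d/       # an existing collector configuration
cp /path/to/your/my-nginx.p pipeline/     # a Pipeline script referenced by the config
git add . && git commit -m "init Datakit config repo"
git remote add origin https://github.datakit.com/path/to/datakit-conf.git
git push -u origin master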

The lookup priority is defined as follows:

  1. Search for the specified file name in the order the git_repos are configured in datakit.conf (it is an array, so multiple Git repositories can be configured), and use the first match. For example, when looking for my-nginx.p, if it is found under pipeline in the first repository, that copy is used; even if the second repository also contains a my-nginx.p, it will not be selected.

  2. If the file is not found in git_repos, Datakit looks in <Datakit Installation Directory>/pipeline for Pipeline scripts, or in <Datakit Installation Directory>/python.d for Python scripts.

Setting the Default Pipeline Script Locally

Version-1.61.0

Supports setting the default Pipeline script locally. If it conflicts with the default script set remotely, the local setting is preferred.

It can be configured in two ways:

  • For host deployment, you can specify the default scripts for each category in the DataKit main configuration file, as follows:

    # default pipeline
    [pipeline.default_pipeline]
    # logging = "<your_script.p>"
    # metric = "<your_script.p>"
    # tracing = "<your_script.p>"
    
  • For container deployment, you can use the environment variable ENV_PIPELINE_DEFAULT_PIPELINE, whose value is, for example, {"logging":"abc.p","metric":"xyz.p"}

Set the Maximum Value of Open File Descriptor

In a Linux environment, you can configure the ulimit entry in the Datakit main configuration file to set the maximum number of open files for Datakit, as follows:

ulimit = 64000

ulimit is configured to 64000 by default.
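
To verify that the limit has been applied to the running process (assuming the process name is datakit), something like the following works:

# Show the max-open-files limit of the running Datakit process
cat /proc/$(pgrep -o -x datakit)/limits | grep "open files"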

CPU Utilization Rate Description for Resource Limit

CPU utilization is limited on a percentage basis (maximum 100.0, regardless of the number of CPU cores). For an 8-core CPU, if cpu_max is 20.0 (that is, 20%), the maximum CPU consumption of DataKit will show as about 160% in the top command.

Collector Password Protection

Version-1.31.0

If you wish to avoid storing passwords in plain text in configuration files, you can utilize this feature.

When DataKit loads a collector configuration file at startup and encounters ENC[], it replaces that text with the password obtained from a file, an environment variable, or AES decryption, and loads the correct password into memory.

ENC currently supports three methods:

  • File Format (Recommended):

Password format in the configuration file: ENC[file:///path/to/enc4dk]. Simply enter the correct password in the corresponding file.

  • AES Encryption Method:

You need to configure the secret key in the main configuration file datakit.conf: crypto_AES_key or crypto_AES_Key_filePath. The password should be formatted as: ENC[aes://5w1UiRjWuVk53k96WfqEaGUYJ/Oje7zr8xmBeGa3ugI=]

Here's an example using mysql to illustrate how to configure and use these methods:

1. File Format:

First, save the password in the file /usr/local/datakit/enc4mysql, then modify the configuration file mysql.conf:

# Partial configuration
[[inputs.mysql]]
  host = "localhost"
  user = "datakit"
  pass = "ENC[file:///usr/local/datakit/enc4mysql]"
  port = 3306
  # sock = "<SOCK>"
  # charset = "utf8"

DK will read the password from /usr/local/datakit/enc4mysql and replace it, resulting in pass = "Hello*******"
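
A small sketch of preparing such a password file; the password value is a placeholder, and the file should be readable only by the user running Datakit:

# Write the plain-text password (no trailing newline) and restrict permissions
printf '%s' '<YOUR-MYSQL-PASSWORD>' > /usr/local/datakit/enc4mysql
chmod 600 /usr/local/datakit/enc4mysql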

2. AES Encryption Method:

First, configure the secret key in datakit.conf:

# Top-level field in the configuration file
# Secret key
crypto_AES_key = "0123456789abcdef"
# Or secret key file:
crypto_AES_Key_filePath = "/usr/local/datakit/mykey"
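
If you use the key-file option, one way to generate a random 16-character (AES-128) key is shown below; whether Datakit trims trailing whitespace from the key file is not documented here, so the newline is stripped to be safe:

# Generate a 16-character key and store it where crypto_AES_Key_filePath points
openssl rand -hex 8 | tr -d '\n' > /usr/local/datakit/mykey
chmod 600 /usr/local/datakit/mykey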

mysql.conf file:

pass = "ENC[aes://5w1UiRjWuVk53k96WfqEaGUYJ/Oje7zr8xmBeGa3ugI=]"

Note that the cipherText obtained through AES encryption needs to be filled in completely. Here is a code example:

package main

import (
    "bytes"
    "crypto/aes"
    "crypto/cipher"
    "crypto/rand"
    "encoding/base64"
    "fmt"
    "io"
)

// AESEncrypt encrypts plaintext with AES-CBC (PKCS7 padding) and returns base64(iv + ciphertext).
func AESEncrypt(key []byte, plaintext string) (string, error) {
    block, err := aes.NewCipher(key)
    if err != nil {
        return "", err
    }

    // PKCS7 padding
    padding := aes.BlockSize - len(plaintext)%aes.BlockSize
    padtext := bytes.Repeat([]byte{byte(padding)}, padding)
    plaintext += string(padtext)
    ciphertext := make([]byte, aes.BlockSize+len(plaintext))
    iv := ciphertext[:aes.BlockSize]
    if _, err := io.ReadFull(rand.Reader, iv); err != nil {
        return "", err
    }
    mode := cipher.NewCBCEncrypter(block, iv)
    mode.CryptBlocks(ciphertext[aes.BlockSize:], []byte(plaintext))

    return base64.StdEncoding.EncodeToString(ciphertext), nil
}

// AESDecrypt decrypts a base64(iv + ciphertext) string produced by AESEncrypt.
func AESDecrypt(key []byte, cryptoText string) (string, error) {
    ciphertext, err := base64.StdEncoding.DecodeString(cryptoText)
    if err != nil {
        return "", err
    }

    block, err := aes.NewCipher(key)
    if err != nil {
        return "", err
    }

    if len(ciphertext) < aes.BlockSize {
        return "", fmt.Errorf("ciphertext too short")
    }

    iv := ciphertext[:aes.BlockSize]
    ciphertext = ciphertext[aes.BlockSize:]

    mode := cipher.NewCBCDecrypter(block, iv)
    mode.CryptBlocks(ciphertext, ciphertext)

    // Remove PKCS7 padding
    padding := int(ciphertext[len(ciphertext)-1])
    if padding > aes.BlockSize {
        return "", fmt.Errorf("invalid padding")
    }
    ciphertext = ciphertext[:len(ciphertext)-padding]

    return string(ciphertext), nil
}
The same logic in Java:

import javax.crypto.Cipher;
import javax.crypto.spec.IvParameterSpec;
import javax.crypto.spec.SecretKeySpec;
import java.security.SecureRandom;
import java.util.Base64;

public class AESUtils {
    public static String AESEncrypt(byte[] key, String plaintext) throws Exception {
        javax.crypto.Cipher cipher = Cipher.getInstance("AES/CBC/PKCS5Padding");
        SecretKeySpec secretKeySpec = new SecretKeySpec(key, "AES");

        SecureRandom random = new SecureRandom();
        byte[] iv = new byte[16];
        random.nextBytes(iv);
        IvParameterSpec ivParameterSpec = new IvParameterSpec(iv);
        cipher.init(Cipher.ENCRYPT_MODE, secretKeySpec, ivParameterSpec);
        byte[] encrypted = cipher.doFinal(plaintext.getBytes());
        byte[] ivAndEncrypted = new byte[iv.length + encrypted.length];
        System.arraycopy(iv, 0, ivAndEncrypted, 0, iv.length);
        System.arraycopy(encrypted, 0, ivAndEncrypted, iv.length, encrypted.length);

        return Base64.getEncoder().encodeToString(ivAndEncrypted);
    }

    public static String AESDecrypt(byte[] key, String cryptoText) throws Exception {
        byte[] cipherText = Base64.getDecoder().decode(cryptoText);

        SecretKeySpec secretKeySpec = new SecretKeySpec(key, "AES");

        if (cipherText.length < 16) {
            throw new Exception("cipherText too short");
        }

        byte[] iv = new byte[16];
        System.arraycopy(cipherText, 0, iv, 0, 16);
        byte[] encrypted = new byte[cipherText.length - 16];
        System.arraycopy(cipherText, 16, encrypted, 0, cipherText.length - 16);

        Cipher cipher = Cipher.getInstance("AES/CBC/PKCS5Padding");
        IvParameterSpec ivParameterSpec = new IvParameterSpec(iv);
        cipher.init(Cipher.DECRYPT_MODE, secretKeySpec, ivParameterSpec);

        byte[] decrypted = cipher.doFinal(encrypted);

        return new String(decrypted);
    }
}    

In a Kubernetes (K8s) environment, the key can be provided through environment variables; see ENV_CRYPTO_AES_KEY and ENV_CRYPTO_AES_KEY_FILEPATH in DaemonSet Installation - Others.

Remote Job


Version-1.63.0


DataKit receives tasks dispatched from the center and executes them. Currently, it supports the JVM dump function.

After installing DataKit (DK), two files are generated in the template/service-task directory of the installation folder: jvm_dump_host_script.py and jvm_dump_k8s_script.py. The former is for the host mode, and the latter is for the Kubernetes environment.

In the host environment, python3 and the requests package must be installed. If they are not, install them using:

pip install requests
# or
pip3 install requests
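
To confirm the dependency is available before the script runs, for instance:

# Verify that python3 and the requests package are installed
python3 -c "import requests; print(requests.__version__)"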

In the Kubernetes (K8S) environment, access to the Kubernetes API is required, so Role-Based Access Control (RBAC) is necessary.

Configuration

Configuration file location:

  • Linux/Mac: /usr/local/datakit/conf.d/datakit.conf
  • Windows: C:\Program Files\datakit\conf.d\datakit.conf

Modify the configuration:

[remote_job]
  enable=true
  envs=["OSS_BUCKET_HOST=<bucket_host>","OSS_ACCESS_KEY_ID=<key>","OSS_ACCESS_KEY_SECRET=<secret key>","OSS_BUCKET_NAME=<name>"]
  interval="100s"
  java_home=""

Add RBAC access:

---

apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: datakit
rules:
- apiGroups: ["rbac.authorization.k8s.io"]
  resources: ["clusterroles"]
  verbs: ["get", "list", "watch"]
- apiGroups: [""]
  resources: ["nodes", "nodes/stats", "nodes/metrics", "namespaces", "pods", "pods/log", "events", "services", "endpoints", "persistentvolumes", "persistentvolumeclaims", "pods/exec"]
  verbs: ["get", "list", "watch", "create"]
- apiGroups: ["apps"]
  resources: ["deployments", "daemonsets", "statefulsets", "replicasets"]
  verbs: ["get", "list", "watch"]
- apiGroups: ["batch"]
  resources: ["jobs", "cronjobs"]
  verbs: [ "get", "list", "watch"]
- apiGroups: ["guance.com"]
  resources: ["datakits"]
  verbs: ["get","list"]
- apiGroups: ["monitoring.coreos.com"]
  resources: ["podmonitors", "servicemonitors"]
  verbs: ["get", "list"]
- apiGroups: ["metrics.k8s.io"]
  resources: ["pods", "nodes"]
  verbs: ["get", "list"]
- nonResourceURLs: ["/metrics"]
  verbs: ["get"]

---

In the above configuration, "pods/exec" is added; the rest should be consistent with datakit.yaml.
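
Assuming the rules above are saved to a file (the file name here is arbitrary), they can be applied with:

# Apply the ClusterRole; the ServiceAccount/ClusterRoleBinding from datakit.yaml stay unchanged
kubectl apply -f datakit-remote-job-rbac.yaml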

Add ENV for remote_job:

- name: ENV_REMOTE_JOB_ENABLE
  value: 'true'
- name: ENV_REMOTE_JOB_ENVS
  value: >-
    OSS_BUCKET_HOST=<bucket host>,OSS_ACCESS_KEY_ID=<key>,OSS_ACCESS_KEY_SECRET=<secret key>,OSS_BUCKET_NAME=<name>
- name: ENV_REMOTE_JOB_JAVA_HOME
- name: ENV_REMOTE_JOB_INTERVAL
  value: 100s

Configuration file description:

  1. enable / ENV_REMOTE_JOB_ENABLE: switch for the remote_job function.
  2. envs / ENV_REMOTE_JOB_ENVS: OSS configuration, including the OSS bucket host, access key ID, access key secret, and bucket name; the obtained JVM dump file is uploaded to OSS.
  3. interval / ENV_REMOTE_JOB_INTERVAL: the interval at which DataKit actively calls the interface to fetch the latest tasks.
  4. java_home / ENV_REMOTE_JOB_JAVA_HOME: in the host environment this is obtained automatically from the $JAVA_HOME environment variable and usually does not need to be configured.

Note that the agent (dd-java-agent.jar) version used should not be lower than v1.4.0-guance.

Extended Readings


  1. If the CPU resource limit is not set, the number of machine/node CPU cores is used.
