summaryrefslogtreecommitdiff
path: root/ironic_python_agent/numa_inspector.py
blob: 3f5ef9d84ee2d92f3cb8ec6b47afa6d822f28d7a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
# Copyright 2017 Red Hat, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

from oslo_log import log
import pint

from ironic_python_agent import errors

# Module-level logger used by the collectors in this module.
LOG = log.getLogger(__name__)

# Unit registry used by get_nodes_memory_info() to turn the "<number> <unit>"
# text read from a node's meminfo file into a plain number.
# NOTE(review): filename=None presumably builds the registry without loading
# pint's default unit definitions - confirm against the pint documentation.
UNIT_CONVERTER = pint.UnitRegistry(filename=None)
# 'kB' and 'KB' are each declared without a reference unit, while 'MB' and
# 'GB' are expressed as multiples of 'KB' (1024 and 1024 * 1024), so that
# converting a quantity to base units is expected to give a KB figure.
UNIT_CONVERTER.define('kB = []')
UNIT_CONVERTER.define('KB = []')
UNIT_CONVERTER.define('MB = 1024 KB')
UNIT_CONVERTER.define('GB = 1048576 KB')


def get_numa_node_id(numa_node_dir):
    """Extract the numeric NUMA node id from a NUMA node directory path.

    The last path component is taken, its first four characters (the
    ``node`` prefix of names such as ``node0``) are dropped and the
    remainder is converted to an integer.

    :param numa_node_dir: NUMA node directory
    :raises: IncompatibleNumaFormatError: when the remainder of the
             directory name cannot be converted to an integer

    :return: NUMA node id
    """
    try:
        dir_name = os.path.basename(numa_node_dir)
        # Everything after the 4-character prefix must be the node number
        node_id = int(dir_name[4:])
    except (IOError, ValueError, IndexError) as err:
        raise errors.IncompatibleNumaFormatError(
            'Failed to get NUMA node id for %(node)s: '
            '%(error)s' % {'node': numa_node_dir, 'error': err})
    return node_id


def get_nodes_memory_info(numa_node_dirs):
    """Gather the total memory of every NUMA node.

    For each node directory the ``meminfo`` file is scanned for the first
    line containing ``MemTotal``; the text after the colon is parsed with
    UNIT_CONVERTER and converted to base units.

    The information is returned in the form of::

        "ram": [{"numa_node": <numa_node_id>, "size_kb": <memory_in_kb>}, ...]

    :param numa_node_dirs: A list of NUMA node directories
    :raises: IncompatibleNumaFormatError: when the meminfo file cannot be
             read, has no MemTotal line, or that line cannot be parsed

    :return: A list of memory information with NUMA node id
    """
    nodes_memory = []
    for node_dir in numa_node_dirs:
        node_id = get_numa_node_id(node_dir)
        meminfo_path = os.path.join(node_dir, 'meminfo')
        try:
            with open(meminfo_path) as meminfo:
                # Stop at the first MemTotal line; mem_line keeps that line
                for mem_line in meminfo:
                    if 'MemTotal' in mem_line:
                        break
                else:
                    raise errors.IncompatibleNumaFormatError(
                        'Memory information is not available for '
                        '%(node)s' % {'node': node_dir})
        except IOError as err:
            raise errors.IncompatibleNumaFormatError(
                'Failed to get memory information '
                'for %(node)s: %(error)s' %
                {'node': node_dir, 'error': err})
        try:
            # Sample line: 'Node 0 MemTotal: 1560000 kB'; the part after
            # the colon carries the size together with its unit
            size_with_unit = mem_line.split(":")[1].strip()
            quantity = UNIT_CONVERTER(size_with_unit)
            size_kb = int(quantity.to_base_units().magnitude)
        except (ValueError, IndexError, pint.UndefinedUnitError) as err:
            raise errors.IncompatibleNumaFormatError(
                'Failed to get memory information for %(node)s: '
                '%(error)s' % {'node': node_dir, 'error': err})
        LOG.debug('Found memory available %d KB in NUMA node %d',
                  size_kb, node_id)
        nodes_memory.append({'numa_node': node_id, 'size_kb': size_kb})
    return nodes_memory


def get_nodes_cores_info(numa_node_dirs):
    """Collect the NUMA nodes cpu's and thread's information.

    NUMA nodes path: /sys/devices/system/node/node<node_id>

    Thread dirs path: /sys/devices/system/node/node<node_id>/cpu<thread_id>

    CPU id file path: /sys/devices/system/node/node<node_id>/cpu<thread_id>/
                      topology/core_id

    Threads that report the same core id inside the same NUMA node are
    grouped together as siblings of one CPU entry.

    The information is returned in the form of::

        "cpus": [
              {
                "cpu": <cpu_id>, "numa_node": <numa_node_id>,
                "thread_siblings": [<list of sibling threads>]
              },
              ...,
            ]

    :param numa_node_dirs: A list of NUMA node directories
    :raises: IncompatibleNumaFormatError: when a node directory cannot be
             listed, a cpu directory name has no numeric suffix, or the
             core_id file cannot be read or parsed

    :return: A list of cpu information with NUMA node id and thread siblings
    """
    dict_cpus = {}
    for numa_node_dir in numa_node_dirs:
        numa_node_id = get_numa_node_id(numa_node_dir)
        try:
            thread_dirs = os.listdir(numa_node_dir)
        except OSError as exc:
            msg = ('Failed to get list of threads for %(node)s: '
                   '%(error)s' % {'node': numa_node_dir, 'error': exc})
            raise errors.IncompatibleNumaFormatError(msg)
        for thread_dir in thread_dirs:
            # Only cpu<thread_id> sub-directories describe threads
            if (not os.path.isdir(os.path.join(numa_node_dir, thread_dir))
                    or not thread_dir.startswith("cpu")):
                continue
            try:
                thread_id = int(thread_dir[3:])
            except (ValueError, IndexError) as exc:
                msg = ('Failed to get cores information for '
                       '%(node)s: %(error)s' %
                       {'node': numa_node_dir, 'error': exc})
                raise errors.IncompatibleNumaFormatError(msg)
            try:
                with open(os.path.join(numa_node_dir, thread_dir, 'topology',
                          'core_id')) as core_id_file:
                    cpu_id = int(core_id_file.read().strip())
            except (IOError, ValueError) as exc:
                # A space separates "thread" from the thread directory name
                # so the message does not read "threadcpu0"
                msg = ('Failed to gather cpu_id for thread '
                       '%(thread)s NUMA node %(node)s: %(error)s' %
                       {'thread': thread_dir, 'node': numa_node_dir,
                        'error': exc})
                raise errors.IncompatibleNumaFormatError(msg)
            # CPU and NUMA node together forms a unique value, as cpu_id is
            # specific to a NUMA node
            # NUMA node id and cpu id tuple is used for unique key
            cpu_item = dict_cpus.setdefault(
                (numa_node_id, cpu_id),
                {'thread_siblings': [], 'cpu': cpu_id,
                 'numa_node': numa_node_id})
            if thread_id not in cpu_item['thread_siblings']:
                cpu_item['thread_siblings'].append(thread_id)
            LOG.debug('Found a thread sibling %d for CPU %d in NUMA node %d',
                      thread_id, cpu_id, numa_node_id)
    return list(dict_cpus.values())


def get_nodes_nics_info(nic_device_path):
    """Map every network interface to the NUMA node it reports.

    Each entry of ``nic_device_path`` that has a ``device/numa_node`` file
    is reported; entries without that file are skipped.

    The information is returned in the form of::

        "nics": [
              {"name": "<network interface name>",
               "numa_node": <numa_node_id>},
              ...,
            ]

    :param nic_device_path: nic device directory path
    :raises: IncompatibleNumaFormatError: when nic_device_path is not a
             directory, or a numa_node file cannot be read or does not
             hold an integer

    :return: A list of nics information with NUMA node id
    """
    if not os.path.isdir(nic_device_path):
        raise errors.IncompatibleNumaFormatError(
            'Failed to get list of NIC\'s, NIC device path '
            'does not exist: %(nic_device_path)s' %
            {'nic_device_path': nic_device_path})
    found_nics = []
    for nic_name in os.listdir(nic_device_path):
        numa_node_file = os.path.join(nic_device_path, nic_name,
                                      'device', 'numa_node')
        # Entries without a numa_node file are not reported
        if not os.path.isfile(numa_node_file):
            continue
        try:
            with open(numa_node_file) as node_file:
                node_id = int(node_file.read().strip())
        except (IOError, ValueError) as err:
            raise errors.IncompatibleNumaFormatError(
                'Failed to gather NIC\'s for NUMA node %(node)s: '
                '%(error)s' % {'node': nic_name, 'error': err})
        LOG.debug('Found a NIC %s in NUMA node %d', nic_name,
                  node_id)
        found_nics.append({'name': nic_name, 'numa_node': node_id})
    return found_nics


def collect_numa_topology_info(data, failures):
    """Collect the NUMA topology information.

    The data is gathered from /sys/devices/system/node/node<X> and
    /sys/class/net/ directories. On success it is stored under the
    ``numa_topology`` key of ``data`` in the form of::

      {
        "numa_topology": {
          "ram": [{"numa_node": <numa_node_id>, "size_kb": <memory_in_kb>},
                   ...],
          "cpus": [
            {
              "cpu": <cpu_id>, "numa_node": <numa_node_id>,
              "thread_siblings": [<list of sibling threads>]
            },
            ...,
          ],
          "nics": [
            {"name": "<network interface name>", "numa_node": <numa_node_id>},
            ...,
          ]
        }
      }

    When the NUMA node path is missing, or any of the collectors raises
    IncompatibleNumaFormatError, a warning is logged and ``data`` is left
    untouched.

    :param data: mutable data that we'll send to inspector
    :param failures: AccumulatedFailures object (not used by this function)

    :return: None
    """
    numa_node_path = '/sys/devices/system/node/'
    nic_device_path = '/sys/class/net/'
    if not os.path.isdir(numa_node_path):
        LOG.warning('Failed to get list of NUMA nodes, NUMA node path '
                    'does not exist: %s', numa_node_path)
        return
    # Keep only the node<X> sub-directories of the NUMA node path
    node_dirs = [
        os.path.join(numa_node_path, entry)
        for entry in os.listdir(numa_node_path)
        if entry.startswith("node")
        and os.path.isdir(os.path.join(numa_node_path, entry))
    ]
    try:
        topology = {
            'ram': get_nodes_memory_info(node_dirs),
            'cpus': get_nodes_cores_info(node_dirs),
            'nics': get_nodes_nics_info(nic_device_path),
        }
    except errors.IncompatibleNumaFormatError as err:
        LOG.warning('Failed to get some NUMA information (%s)', err)
        return
    data['numa_topology'] = topology