Refactor job data set analysis fields.
Primary author was @jtpalmer

Co-Authored-By: Jeffrey T. Palmer <[email protected]>
aaronweeden and jtpalmer committed Dec 21, 2023
1 parent 108f041 commit 84f47e8
Showing 3 changed files with 165 additions and 129 deletions.
155 changes: 52 additions & 103 deletions classes/DataWarehouse/Query/SUPREMM/JobDataset.php
@@ -180,17 +180,45 @@ public function __construct(
*/
private function addFieldByDefinition(array $fieldDef)
{
$tableAlias = $fieldDef['tableAlias'];
$table = null;
if (array_key_exists($tableAlias, $this->tables)) {
$table = $this->tables[$tableAlias];
} elseif (array_key_exists($tableAlias, $this->tableDefs)) {
$table = $this->addTableByDefinition($this->tableDefs[$tableAlias]);
if (array_key_exists('tableAlias', $fieldDef)) {
$tableAlias = $fieldDef['tableAlias'];
if (array_key_exists($tableAlias, $this->tables)) {
$table = $this->tables[$tableAlias];
} elseif (array_key_exists($tableAlias, $this->tableDefs)) {
$table = $this->addTableByDefinition($this->tableDefs[$tableAlias]);
} else {
throw new \Exception(sprintf('Unrecognized table alias "%s"', $tableAlias));
}
}

if (!array_key_exists('alias', $fieldDef)) {
throw new \Exception(sprintf('Missing alias for definition: %s', json_encode($fieldDef)));
}
$alias = $fieldDef['alias'];

if ($table !== null && array_key_exists('column', $fieldDef)) {
$this->addField(new TableField($table, $fieldDef['column'], $alias));
} elseif (array_key_exists('formula', $fieldDef)) {
$this->addField(new FormulaField($fieldDef['formula'], $alias));
} else {
throw new \Exception(sprintf('Unrecognized table alias "%s"', $tableAlias));
throw new \Exception(sprintf(
'Missing tableAlias and column or formula for "%s", definition: %s',
$alias,
json_encode($fieldDef)
));
}

$this->documentation[$alias] = $fieldDef;

if (array_key_exists('withError', $fieldDef)) {
$errorDef = $fieldDef['withError'];
$this->addErrorField(
$errorDef['column'],
$errorDef['tableAlias'],
array_key_exists('name', $errorDef) ? $errorDef['name'] : null
);
}
$this->addField(new TableField($table, $fieldDef['column'], $fieldDef['alias']));
$this->documentation[$fieldDef['alias']] = $fieldDef;
}

/**
@@ -228,21 +256,25 @@ private function joinTo($othertable, $joinkey, $otherkey, $colalias, $idcol = "i
}

/**
* Add a field and the corresponding error field to the query.
* Add an error field to the query.
*
* @param Field $field The field to add to the query.
* @param string $fieldName The name of the field.
* @param Table $errorTable The error table.
* @param string $errorTableAlias The error table alias.
* @param string $errorName The name of the error field, if null then the field name is
* autogenerated based on the fieldName.
*
* @return null
*/
private function addFieldWithError($field, $fieldName, $errorTable, $errorName = null)
private function addErrorField($fieldName, $errorTableAlias, $errorName = null)
{
static $errorTableIdx = 0;
$errorTable = null;
if (array_key_exists($errorTableAlias, $this->tables)) {
$errorTable = $this->tables[$errorTableAlias];
} elseif (array_key_exists($errorTableAlias, $this->tableDefs)) {
$errorTable = $this->addTableByDefinition($this->tableDefs[$errorTableAlias]);
} else {
throw new \Exception(sprintf('Unrecognized table alias "%s"', $errorTableAlias));
}

$this->addField($field);
static $errorTableIdx = 0;

$errordesc = new Table(
new Schema('modw'),
@@ -307,98 +339,15 @@ private function addMetricsFields()

private function addAnalyticsFields()
{
$dataTable = $this->getDataTable();
$joberrors = new Table(new Schema('modw_supremm'), 'job_errors', 'je');
$this->addTable($joberrors);

$this->addWhereCondition(
new WhereCondition(
new TableField($dataTable, '_id'),
'=',
new TableField($joberrors, '_id')
)
);

foreach ($this->fieldDefs as $sfield => $sdata) {
// TODO work out a better way to have metrics have multiple
// meta-types (ie cpu user is an analytic as well as a metric).
if ($sfield == "cpu_user") {
$this->addFieldWithError(new TableField($dataTable, $sfield), $sfield, $joberrors);
$this->documentation[$sfield] = $sdata;
$this->addFieldByDefinition($sdata);
$this->addErrorField($sfield, 'je');
} elseif ($sdata['dtype'] == 'analysis') {
$this->addFieldByDefinition($sdata);
}
}
$this->addFieldWithError(
new FormulaField("(1.0 - (1.0 / (1.0 + 1000.0 * jf.catastrophe)))", "homogeneity"),
'catastrophe',
$joberrors,
'homogeneity_error'
);
$this->documentation['homogeneity'] = array(
'name'=> 'Homogeneity',
'units' => 'ratio',
'per' => 'job',
'visibility' => 'public',
'documentation' => 'A measure of how uniform the L1D load rate is over the lifetime of the job.
Jobs with a low homogeneity value (~0) should be investigated to check if there
has been a catastrophic failure during the job',
'batchExport' => true,
'dtype' => 'analysis'
);

$this->addFieldWithError(
new FormulaField('(1.0 - (jf.cpu_user_imbalance/100.0))', 'cpu_user_balance'),
'cpu_user_imbalance',
$joberrors,
'cpu_user_balance_error'
);
$this->documentation['cpu_user_balance'] = array(
'name'=> 'CPU User Balance',
'units' => 'ratio',
'per' => 'job',
'visibility' => 'public',
'documentation' => 'A measure of how uniform the CPU usage is between the cores that the job was
assigned. A value of CPU User Balance near 1 corresponds to a job with evenly
loaded CPUs. A value near 0 corresponds to a job with one or more CPU cores
with much lower utilization that the others.',
'batchExport' => true,
'dtype' => 'analysis'
);

$this->addFieldWithError(
new FormulaField('(1.0 - 1.0/POW(2-jf.max_memory, 5))', 'mem_coefficient'),
'max_memory',
$joberrors,
'mem_coefficient_error'
);
$this->documentation['mem_coefficient'] = array(
'name'=> 'Memory Headroom',
'units' => 'ratio',
'per' => 'job',
'visibility' => 'public',
'documentation' => 'A measure of the peak compute-node memory usage for the job. A value of 0 corresponds
to a job which used all of the available memory and 1 corresponds to a job with low memory usage.
The value is computed as 1 - 1 / (2 - m)^5, where m is the ratio of memory used to memory available for
the compute node that had the highest memory usage.',
'batchExport' => true,
'dtype' => 'analysis'
);

$this->addFieldWithError(
new FormulaField("LEAST(jf.wall_time / jf.requested_wall_time, 1)", "wall_accuracy"),
'requested_wall_time',
$joberrors,
'requested_wall_time_error'
);
$this->documentation['wall_accuracy'] = array(
'name'=> 'Walltime Accuracy',
'units' => 'ratio',
'per' => 'job',
'visibility' => 'public',
'documentation' => 'The ratio of actual wall time to requested wall time. A value near 1 indicates that
the requested wall time close to the actual wall time. A good wall time accuracy improves
system wide scheduling.',
'batchExport' => true,
'dtype' => 'analysis'
);
}
}
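
For orientation while reading the hunks above (an editorial sketch, not code from this commit): the refactored `addFieldByDefinition()` now accepts either a column-backed or a formula-backed definition, plus an optional `withError` entry that it forwards to `addErrorField()`. The key names (`alias`, `tableAlias`, `column`, `formula`, `withError`, `name`) are taken from the diff; the concrete values below are illustrative only.

```php
// Column-backed definition: 'alias' is required; 'tableAlias' must name a table
// already registered in $this->tables or defined in $this->tableDefs.
$columnDef = array(
    'alias'      => 'cpu_user',
    'tableAlias' => 'jf',
    'column'     => 'cpu_user',
    // Optional: also add an error field taken from the job_errors table ('je').
    // If 'name' is omitted, the error field name is autogenerated from the column.
    'withError'  => array(
        'column'     => 'cpu_user',
        'tableAlias' => 'je'
    )
);

// Formula-backed definition: no table lookup is needed, only 'alias' and 'formula'.
$formulaDef = array(
    'alias'     => 'wall_accuracy',
    'formula'   => 'LEAST(jf.wall_time / jf.requested_wall_time, 1)',
    'withError' => array(
        'column'     => 'requested_wall_time',
        'tableAlias' => 'je',
        'name'       => 'requested_wall_time_error'
    )
);

// Either shape would then be passed to the query as, e.g.:
// $this->addFieldByDefinition($formulaDef);
```

Because formula-backed definitions no longer require a resolved table, the analytics formulas can be expressed as data (see the `etl.schema.js` changes below) instead of hard-coded `addFieldWithError()` calls.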
47 changes: 23 additions & 24 deletions docs/customization.md
@@ -4,9 +4,9 @@ title: Customization

This document describes some advanced customizations for the Job Performance module.

**The automated upgade scripts do not have any support for preserving
**The automated upgrade scripts do not have any support for preserving
customizations. Any changes made to the underlying Open XDMoD source code
will likely be overwitten when the software us upgraded.**
will likely be overwritten when the software is upgraded.**

## Job Analytics

@@ -44,26 +44,25 @@ hardware support), then the Open XDMoD instance can be customized to never show
**These instructions only apply to Open XDMoD {{ page.sw_version }}. For other
versions please refer to the documentation for that release.**

To remove an analytic, you need to edit `/usr/share/xdmod/classes/DataWarehouse/Query/SUPREMM/JobDataset.php`
and remove the code associated with the analytic. For example, to remove the homogeneity
analytic you would remove (or comment out) lines 330-346. I.e. the function call to `addFieldWithError` and the
update to the documentation object. The lines to remove are shown below.
```php
330 $this->addFieldWithError(
331 new FormulaField("(1.0 - (1.0 / (1.0 + 1000.0 * jf.catastrophe)))", "homogeneity"),
332 'catastrophe',
333 $joberrors,
334 'homogeneity_error'
335 );
336 $this->documentation['homogeneity'] = array(
337 'name'=> 'Homogeneity',
338 'units' => 'ratio',
339 'per' => 'job',
340 'visibility' => 'public',
341 'documentation' => 'A measure of how uniform the L1D load rate is over the lifetime of the job.
342 Jobs with a low homogeneity value (~0) should be investigated to check if there
343 has been a catastrophic failure during the job',
344 'batchExport' => true,
345 'dtype' => 'analysis'
346 );
To remove an analytic, you need to edit `/usr/share/xdmod/etl/js/config/supremm/etl.schema.js`
and remove the code associated with the analytic. For example, to remove the homogeneity
analytic you would remove (or comment out) lines 2716–2732. The lines to remove are shown below.
```js
2716 homogeneity: {
2717 name: 'Homogeneity',
2718 formula: '(1.0 - (1.0 / (1.0 + 1000.0 * jf.catastrophe)))',
2719 withError: {
2720 name: 'homogeneity_error',
2721 column: 'catastrophe',
2722 tableAlias: 'je'
2723 },
2724 unit: 'ratio',
2725 per: 'job',
2726 visibility: 'public',
2727 comments: 'A measure of how uniform the L1D load rate is over the lifetime of the job. ' +
2728 'Jobs with a low homogeneity value (~0) should be investigated to check if there ' +
2729 'has been a catastrophic failure during the job',
2730 batchExport: true,
2731 dtype: 'analysis'
2732 },
```
92 changes: 90 additions & 2 deletions etl/js/config/supremm/etl.schema.js
@@ -1080,7 +1080,7 @@ module.exports = {
catastrophe: {
unit: "ratio",
type: "double",
dtype: "analysis",
dtype: 'ignore',
nullable: true,
def: null,
batchExport: true,
@@ -1323,7 +1323,7 @@ module.exports = {
cpu_user_imbalance: {
unit: "%",
type: "double",
dtype: "analysis",
dtype: 'ignore',
nullable: true,
def: null,
batchExport: true,
@@ -2681,6 +2681,19 @@ module.exports = {
// Include columns from this table in the raw statistics configuration.
table: 'modw_supremm.job',

tables: [
{
schema: 'modw_supremm',
name: 'job_errors',
alias: 'je',
join: {
primaryKey: '_id',
foreignTableAlias: 'jf',
foreignKey: '_id'
}
}
],

// Fields not already defined as part of the ETL schema.
fields: {
timezone: {
@@ -2698,6 +2711,81 @@ module.exports = {
foreignKey: 'resource_id',
column: 'timezone'
}
},
// Note that the code below is referenced in docs/customization.md.
homogeneity: {
name: 'Homogeneity',
formula: '(1.0 - (1.0 / (1.0 + 1000.0 * jf.catastrophe)))',
withError: {
name: 'homogeneity_error',
column: 'catastrophe',
tableAlias: 'je'
},
unit: 'ratio',
per: 'job',
visibility: 'public',
comments: 'A measure of how uniform the L1D load rate is over the lifetime of the job. ' +
'Jobs with a low homogeneity value (~0) should be investigated to check if there ' +
'has been a catastrophic failure during the job',
batchExport: true,
dtype: 'analysis',
group: 'Other'
},
cpu_user_balance: {
name: 'CPU User Balance',
formula: '(1.0 - (jf.cpu_user_imbalance/100.0))',
withError: {
name: 'cpu_user_balance_error',
column: 'cpu_user_imbalance',
tableAlias: 'je'
},
unit: 'ratio',
per: 'job',
visibility: 'public',
comments: 'A measure of how uniform the CPU usage is between the cores that the job was ' +
'assigned. A value of CPU User Balance near 1 corresponds to a job with evenly ' +
'loaded CPUs. A value near 0 corresponds to a job with one or more CPU cores ' +
'with much lower utilization that the others.',
batchExport: true,
dtype: 'analysis',
group: 'Other'
},
mem_coefficient: {
name: 'Memory Headroom',
formula: '(1.0 - 1.0/POW(2-jf.max_memory, 5))',
withError: {
name: 'mem_coefficient_error',
column: 'max_memory',
tableAlias: 'je'
},
unit: 'ratio',
per: 'job',
visibility: 'public',
comments: 'A measure of the peak compute-node memory usage for the job. A value of 0 corresponds ' +
'to a job which used all of the available memory and 1 corresponds to a job with low memory usage. ' +
'The value is computed as 1 - 1 / (2 - m)^5, where m is the ratio of memory used to memory available for ' +
'the compute node that had the highest memory usage.',
batchExport: true,
dtype: 'analysis',
group: 'Other'
},
wall_accuracy: {
name: 'Walltime Accuracy',
formula: 'LEAST(jf.wall_time / jf.requested_wall_time, 1)',
withError: {
name: 'requested_wall_time_error',
column: 'requested_wall_time',
tableAlias: 'je'
},
unit: 'ratio',
per: 'job',
visibility: 'public',
comments: 'The ratio of actual wall time to requested wall time. A value near 1 indicates that ' +
'the requested wall time close to the actual wall time. A good wall time accuracy improves ' +
'system wide scheduling.',
batchExport: true,
dtype: 'analysis',
group: 'Other'
}
}
}
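
An editorial aside connecting the two files (not part of the commit): the new `tables` entry above, which registers `modw_supremm.job_errors` under the alias `je` and joins it to `jf` on `_id`, presumably takes over the join that this commit removes from `addAnalyticsFields()` in `JobDataset.php`. The removed PHP, reproduced from the diff above, built that join explicitly:

```php
// Explicit join to the job_errors table, as previously built in addAnalyticsFields();
// the commit replaces this with the declarative 'tables' entry in etl.schema.js.
$dataTable = $this->getDataTable();
$joberrors = new Table(new Schema('modw_supremm'), 'job_errors', 'je');
$this->addTable($joberrors);
$this->addWhereCondition(
    new WhereCondition(
        new TableField($dataTable, '_id'),
        '=',
        new TableField($joberrors, '_id')
    )
);
```

With the join and the per-analytic field definitions expressed as configuration, removing or adding an analytic becomes a configuration edit rather than a code change, which is what the updated `docs/customization.md` instructions now describe.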