diff --git a/classes/DataWarehouse/Query/SUPREMM/JobDataset.php b/classes/DataWarehouse/Query/SUPREMM/JobDataset.php index 6fe15342b..acd17ac85 100644 --- a/classes/DataWarehouse/Query/SUPREMM/JobDataset.php +++ b/classes/DataWarehouse/Query/SUPREMM/JobDataset.php @@ -180,17 +180,45 @@ public function __construct( */ private function addFieldByDefinition(array $fieldDef) { - $tableAlias = $fieldDef['tableAlias']; $table = null; - if (array_key_exists($tableAlias, $this->tables)) { - $table = $this->tables[$tableAlias]; - } elseif (array_key_exists($tableAlias, $this->tableDefs)) { - $table = $this->addTableByDefinition($this->tableDefs[$tableAlias]); + if (array_key_exists('tableAlias', $fieldDef)) { + $tableAlias = $fieldDef['tableAlias']; + if (array_key_exists($tableAlias, $this->tables)) { + $table = $this->tables[$tableAlias]; + } elseif (array_key_exists($tableAlias, $this->tableDefs)) { + $table = $this->addTableByDefinition($this->tableDefs[$tableAlias]); + } else { + throw new \Exception(sprintf('Unrecognized table alias "%s"', $tableAlias)); + } + } + + if (!array_key_exists('alias', $fieldDef)) { + throw new \Exception(sprintf('Missing alias for definition: %s', json_encode($fieldDef))); + } + $alias = $fieldDef['alias']; + + if ($table !== null && array_key_exists('column', $fieldDef)) { + $this->addField(new TableField($table, $fieldDef['column'], $alias)); + } elseif (array_key_exists('formula', $fieldDef)) { + $this->addField(new FormulaField($fieldDef['formula'], $alias)); } else { - throw new \Exception(sprintf('Unrecognized table alias "%s"', $tableAlias)); + throw new \Exception(sprintf( + 'Missing tableAlias and column or formula for "%s", definition: %s', + $alias, + json_encode($fieldDef) + )); + } + + $this->documentation[$alias] = $fieldDef; + + if (array_key_exists('withError', $fieldDef)) { + $errorDef = $fieldDef['withError']; + $this->addErrorField( + $errorDef['column'], + $errorDef['tableAlias'], + array_key_exists('name', $errorDef) ? 
$errorDef['name'] : null + ); } - $this->addField(new TableField($table, $fieldDef['column'], $fieldDef['alias'])); - $this->documentation[$fieldDef['alias']] = $fieldDef; } /** @@ -228,21 +256,25 @@ private function joinTo($othertable, $joinkey, $otherkey, $colalias, $idcol = "i } /** - * Add a field and the corresponding error field to the query. + * Add an error field to the query. * - * @param Field $field The field to add to the query. * @param string $fieldName The name of the field. - * @param Table $errorTable The error table. + * @param string $errorTableAlias The error table alias. * @param string $errorName The name of the error field, if null then the field name is * autogenerated based on the fieldName. - * - * @return null */ - private function addFieldWithError($field, $fieldName, $errorTable, $errorName = null) + private function addErrorField($fieldName, $errorTableAlias, $errorName = null) { - static $errorTableIdx = 0; + $errorTable = null; + if (array_key_exists($errorTableAlias, $this->tables)) { + $errorTable = $this->tables[$errorTableAlias]; + } elseif (array_key_exists($errorTableAlias, $this->tableDefs)) { + $errorTable = $this->addTableByDefinition($this->tableDefs[$errorTableAlias]); + } else { + throw new \Exception(sprintf('Unrecognized table alias "%s"', $errorTableAlias)); + } - $this->addField($field); + static $errorTableIdx = 0; $errordesc = new Table( new Schema('modw'), @@ -307,98 +339,15 @@ private function addMetricsFields() private function addAnalyticsFields() { - $dataTable = $this->getDataTable(); - $joberrors = new Table(new Schema('modw_supremm'), 'job_errors', 'je'); - $this->addTable($joberrors); - - $this->addWhereCondition( - new WhereCondition( - new TableField($dataTable, '_id'), - '=', - new TableField($joberrors, '_id') - ) - ); - foreach ($this->fieldDefs as $sfield => $sdata) { // TODO work out a better way to have metrics have multiple // meta-types (ie cpu user is an analytic as well as a metric). 
if ($sfield == "cpu_user") { - $this->addFieldWithError(new TableField($dataTable, $sfield), $sfield, $joberrors); - $this->documentation[$sfield] = $sdata; + $this->addFieldByDefinition($sdata); + $this->addErrorField($sfield, 'je'); + } elseif ($sdata['dtype'] == 'analysis') { + $this->addFieldByDefinition($sdata); } } - $this->addFieldWithError( - new FormulaField("(1.0 - (1.0 / (1.0 + 1000.0 * jf.catastrophe)))", "homogeneity"), - 'catastrophe', - $joberrors, - 'homogeneity_error' - ); - $this->documentation['homogeneity'] = array( - 'name'=> 'Homogeneity', - 'units' => 'ratio', - 'per' => 'job', - 'visibility' => 'public', - 'documentation' => 'A measure of how uniform the L1D load rate is over the lifetime of the job. - Jobs with a low homogeneity value (~0) should be investigated to check if there - has been a catastrophic failure during the job', - 'batchExport' => true, - 'dtype' => 'analysis' - ); - - $this->addFieldWithError( - new FormulaField('(1.0 - (jf.cpu_user_imbalance/100.0))', 'cpu_user_balance'), - 'cpu_user_imbalance', - $joberrors, - 'cpu_user_balance_error' - ); - $this->documentation['cpu_user_balance'] = array( - 'name'=> 'CPU User Balance', - 'units' => 'ratio', - 'per' => 'job', - 'visibility' => 'public', - 'documentation' => 'A measure of how uniform the CPU usage is between the cores that the job was - assigned. A value of CPU User Balance near 1 corresponds to a job with evenly - loaded CPUs. 
A value near 0 corresponds to a job with one or more CPU cores - with much lower utilization that the others.', - 'batchExport' => true, - 'dtype' => 'analysis' - ); - - $this->addFieldWithError( - new FormulaField('(1.0 - 1.0/POW(2-jf.max_memory, 5))', 'mem_coefficient'), - 'max_memory', - $joberrors, - 'mem_coefficient_error' - ); - $this->documentation['mem_coefficient'] = array( - 'name'=> 'Memory Headroom', - 'units' => 'ratio', - 'per' => 'job', - 'visibility' => 'public', - 'documentation' => 'A measure of the peak compute-node memory usage for the job. A value of 0 corresponds - to a job which used all of the available memory and 1 corresponds to a job with low memory usage. - The value is computed as 1 - 1 / (2 - m)^5, where m is the ratio of memory used to memory available for - the compute node that had the highest memory usage.', - 'batchExport' => true, - 'dtype' => 'analysis' - ); - - $this->addFieldWithError( - new FormulaField("LEAST(jf.wall_time / jf.requested_wall_time, 1)", "wall_accuracy"), - 'requested_wall_time', - $joberrors, - 'requested_wall_time_error' - ); - $this->documentation['wall_accuracy'] = array( - 'name'=> 'Walltime Accuracy', - 'units' => 'ratio', - 'per' => 'job', - 'visibility' => 'public', - 'documentation' => 'The ratio of actual wall time to requested wall time. A value near 1 indicates that - the requested wall time close to the actual wall time. A good wall time accuracy improves - system wide scheduling.', - 'batchExport' => true, - 'dtype' => 'analysis' - ); } } diff --git a/docs/customization.md b/docs/customization.md index 3661489f8..0b9c70a6b 100644 --- a/docs/customization.md +++ b/docs/customization.md @@ -4,9 +4,9 @@ title: Customization This document describes some advanced customizations for the Job Performance module. -**The automated upgade scripts do not have any support for preserving +**The automated upgrade scripts do not have any support for preserving customizations. 
Any changes made to the underlying Open XDMoD source code -will likely be overwitten when the software us upgraded.** +will likely be overwritten when the software is upgraded.** ## Job Analytics @@ -44,26 +44,26 @@ hardware support), then the Open XDMoD instance can be customized to never show **These instructions only apply to Open XDMoD {{ page.sw_version }}. For other versions please refer to the documentation for that release.** -To remove an analytic, you need to edit `/usr/share/xdmod/classes/DataWarehouse/Query/SUPREMM/JobDataset.php` -and remove the code associated with the analytic. For example, to remove the homogeneity -analytic you would remove (or comment out) lines 330-346. I.e. the function call to `addFieldWithError` and the -update to the documentation object. The lines to remove are shown below. -```php -330 $this->addFieldWithError( -331 new FormulaField("(1.0 - (1.0 / (1.0 + 1000.0 * jf.catastrophe)))", "homogeneity"), -332 'catastrophe', -333 $joberrors, -334 'homogeneity_error' -335 ); -336 $this->documentation['homogeneity'] = array( -337 'name'=> 'Homogeneity', -338 'units' => 'ratio', -339 'per' => 'job', -340 'visibility' => 'public', -341 'documentation' => 'A measure of how uniform the L1D load rate is over the lifetime of the job. -342 Jobs with a low homogeneity value (~0) should be investigated to check if there -343 has been a catastrophic failure during the job', -344 'batchExport' => true, -345 'dtype' => 'analysis' -346 ); +To remove an analytic, you need to edit `/usr/share/xdmod/etl/js/config/supremm/etl.schema.js` +and remove the code associated with the analytic. For example, to remove the homogeneity +analytic you would remove (or comment out) lines 2716–2733. The lines to remove are shown below. 
+```js +2716 homogeneity: { +2717 name: 'Homogeneity', +2718 formula: '(1.0 - (1.0 / (1.0 + 1000.0 * jf.catastrophe)))', +2719 withError: { +2720 name: 'homogeneity_error', +2721 column: 'catastrophe', +2722 tableAlias: 'je' +2723 }, +2724 unit: 'ratio', +2725 per: 'job', +2726 visibility: 'public', +2727 comments: 'A measure of how uniform the L1D load rate is over the lifetime of the job. ' + +2728 'Jobs with a low homogeneity value (~0) should be investigated to check if there ' + +2729 'has been a catastrophic failure during the job', +2730 batchExport: true, +2731 dtype: 'analysis', +2732 group: 'Other' +2733 }, ``` diff --git a/etl/js/config/supremm/etl.schema.js b/etl/js/config/supremm/etl.schema.js index 457f7059d..c4fba38fc 100644 --- a/etl/js/config/supremm/etl.schema.js +++ b/etl/js/config/supremm/etl.schema.js @@ -1080,7 +1080,7 @@ module.exports = { catastrophe: { unit: "ratio", type: "double", - dtype: "analysis", + dtype: 'ignore', nullable: true, def: null, batchExport: true, @@ -1323,7 +1323,7 @@ module.exports = { cpu_user_imbalance: { unit: "%", type: "double", - dtype: "analysis", + dtype: 'ignore', nullable: true, def: null, batchExport: true, @@ -2681,6 +2681,19 @@ module.exports = { // Include columns from this table in the raw statistics configuration. table: 'modw_supremm.job', + tables: [ + { + schema: 'modw_supremm', + name: 'job_errors', + alias: 'je', + join: { + primaryKey: '_id', + foreignTableAlias: 'jf', + foreignKey: '_id' + } + } + ], + // Fields not already defined as part of the ETL schema. fields: { timezone: { @@ -2698,6 +2711,81 @@ module.exports = { foreignKey: 'resource_id', column: 'timezone' } + }, + // Note that the code below is referenced in docs/customization.md. 
+ homogeneity: { + name: 'Homogeneity', + formula: '(1.0 - (1.0 / (1.0 + 1000.0 * jf.catastrophe)))', + withError: { + name: 'homogeneity_error', + column: 'catastrophe', + tableAlias: 'je' + }, + unit: 'ratio', + per: 'job', + visibility: 'public', + comments: 'A measure of how uniform the L1D load rate is over the lifetime of the job. ' + + 'Jobs with a low homogeneity value (~0) should be investigated to check if there ' + + 'has been a catastrophic failure during the job', + batchExport: true, + dtype: 'analysis', + group: 'Other' + }, + cpu_user_balance: { + name: 'CPU User Balance', + formula: '(1.0 - (jf.cpu_user_imbalance/100.0))', + withError: { + name: 'cpu_user_balance_error', + column: 'cpu_user_imbalance', + tableAlias: 'je' + }, + unit: 'ratio', + per: 'job', + visibility: 'public', + comments: 'A measure of how uniform the CPU usage is between the cores that the job was ' + + 'assigned. A value of CPU User Balance near 1 corresponds to a job with evenly ' + + 'loaded CPUs. A value near 0 corresponds to a job with one or more CPU cores ' + + 'with much lower utilization that the others.', + batchExport: true, + dtype: 'analysis', + group: 'Other' + }, + mem_coefficient: { + name: 'Memory Headroom', + formula: '(1.0 - 1.0/POW(2-jf.max_memory, 5))', + withError: { + name: 'mem_coefficient_error', + column: 'max_memory', + tableAlias: 'je' + }, + unit: 'ratio', + per: 'job', + visibility: 'public', + comments: 'A measure of the peak compute-node memory usage for the job. A value of 0 corresponds ' + + 'to a job which used all of the available memory and 1 corresponds to a job with low memory usage. 
' + + 'The value is computed as 1 - 1 / (2 - m)^5, where m is the ratio of memory used to memory available for ' + + 'the compute node that had the highest memory usage.', + batchExport: true, + dtype: 'analysis', + group: 'Other' + }, + wall_accuracy: { + name: 'Walltime Accuracy', + formula: 'LEAST(jf.wall_time / jf.requested_wall_time, 1)', + withError: { + name: 'requested_wall_time_error', + column: 'requested_wall_time', + tableAlias: 'je' + }, + unit: 'ratio', + per: 'job', + visibility: 'public', + comments: 'The ratio of actual wall time to requested wall time. A value near 1 indicates that ' + + 'the requested wall time close to the actual wall time. A good wall time accuracy improves ' + + 'system wide scheduling.', + batchExport: true, + dtype: 'analysis', + group: 'Other' } } }