Implemented a health check for data source administration YAML files.

master
Marcus Bakker 2019-08-20 11:14:07 +02:00
parent 248c6a07d8
commit 98067447c6
3 changed files with 324 additions and 11 deletions
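A minimal sketch of driving the new health check from Python, using check_yaml_file_health() from the new health.py and the FILE_TYPE_DATA_SOURCE_ADMINISTRATION constant referenced in the diff below (the YAML filename is a placeholder):

from constants import FILE_TYPE_DATA_SOURCE_ADMINISTRATION
from health import check_yaml_file_health

# health_is_called=True prints the detailed error messages to stdout;
# note that health.py caches the file's modified/error state in a 'cache/' directory
check_yaml_file_health('data-sources.yaml', FILE_TYPE_DATA_SOURCE_ADMINISTRATION, health_is_called=True)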

View File

@@ -51,6 +51,7 @@ def _init_menu():
'not updated without your approval. The updated visibility '
'scores are calculated in the same way as with the option: '
'-y, --yaml', action='store_true')
parser_data_sources.add_argument('--health', help='check the YAML file(s) for errors', action='store_true')
# create the visibility parser
parser_visibility = subparsers.add_parser('visibility', aliases=['v'],
@@ -76,9 +77,9 @@ def _init_menu():
action='store_true')
parser_visibility.add_argument('-o', '--overlay', help='generate a visibility layer overlaid with detections for '
'the ATT&CK navigator', action='store_true')
parser_visibility.add_argument('-g', '--graph', help='generate a graph with visibility items added through time',
parser_visibility.add_argument('-g', '--graph', help='generate a graph with visibility added through time',
action='store_true')
parser_visibility.add_argument('--health', help='check the technique YAML file for errors', action='store_true')
parser_visibility.add_argument('--health', help='check the YAML file for errors', action='store_true')
# create the detection parser
parser_detection = subparsers.add_parser('detection', aliases=['d'],
@@ -106,9 +107,9 @@ def _init_menu():
action='store_true')
parser_detection.add_argument('-o', '--overlay', help='generate a detection layer overlaid with visibility for '
'the ATT&CK navigator', action='store_true')
parser_detection.add_argument('-g', '--graph', help='generate a graph with detection items added through time',
parser_detection.add_argument('-g', '--graph', help='generate a graph with detections added through time',
action='store_true')
parser_detection.add_argument('--health', help='check the technique YAML file for errors', action='store_true')
parser_detection.add_argument('--health', help='check the YAML file(s) for errors', action='store_true')
# create the group parser
parser_group = subparsers.add_parser('group', aliases=['g'],
@@ -145,7 +146,7 @@ def _init_menu():
'the EQL search. The default behaviour is to only include the '
'most recent \'score\' objects',
action='store_true', default=False)
parser_group.add_argument('--health', help='check the technique YAML file for errors', action='store_true')
parser_group.add_argument('--health', help='check the YAML file(s) for errors', action='store_true')
# create the generic parser
parser_generic = subparsers.add_parser('generic', description='Generic functions which will output to stdout.',
@@ -180,14 +181,14 @@ def _menu(menu_parser):
interactive_menu()
elif args.subparser in ['datasource', 'ds']:
if check_file(args.file_ds, FILE_TYPE_DATA_SOURCE_ADMINISTRATION):
if check_file(args.file_ds, FILE_TYPE_DATA_SOURCE_ADMINISTRATION, args.health):
file_ds = args.file_ds
if args.search:
file_ds = search(args.file_ds, FILE_TYPE_DATA_SOURCE_ADMINISTRATION, args.search)
if not file_ds:
quit() # something went wrong in executing the search or 0 results were returned
if args.update and check_file(args.file_tech, FILE_TYPE_TECHNIQUE_ADMINISTRATION):
if args.update and check_file(args.file_tech, FILE_TYPE_TECHNIQUE_ADMINISTRATION, args.health):
update_technique_administration_file(file_ds, args.file_tech)
if args.layer:
generate_data_sources_layer(file_ds)

health.py (new file, 307 lines)
View File

@@ -0,0 +1,307 @@
import os
import pickle
from difflib import SequenceMatcher
from constants import *
def _print_error_msg(msg, print_error):
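"""
Print the given error message to stdout when printing is enabled
:param msg: error message
:param print_error: print the message when set to True
:return: True, so the caller can record that an error was found
"""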
if print_error:
print(msg)
return True
def _update_health_state(current, update):
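"""
Merge the result of the latest check into the current error state
:param current: the error state so far
:param update: the result of the latest check
:return: True when either value is True, otherwise False
"""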
if current or update:
return True
else:
return update
def _is_file_modified(filename):
"""
Check if the provided file was modified since the last check
:param filename: file location
:return: true when modified else false
"""
last_modified_file = 'cache/last-modified_' + os.path.splitext(os.path.basename(filename))[0]
def _update_modified_date(date):
with open(last_modified_file, 'wb') as fd:
pickle.dump(date, fd)
if not os.path.exists(last_modified_file):
last_modified = os.path.getmtime(filename)
_update_modified_date(last_modified)
return True
else:
with open(last_modified_file, 'rb') as f:
last_modified_cache = pickle.load(f)
last_modified_current = os.path.getmtime(filename)
if last_modified_cache != last_modified_current:
_update_modified_date(last_modified_current)
return True
else:
return False
def _get_health_state_cache(filename):
"""
Get file health state from disk
:param filename: file location
:return: the cached error state
"""
last_error_file = 'cache/last-error-state_' + os.path.splitext(os.path.basename(filename))[0]
if os.path.exists(last_error_file):
with open(last_error_file, 'rb') as f:
last_error_state_cache = pickle.load(f)
return last_error_state_cache
def _update_health_state_cache(filename, has_error):
"""
Write the file health state to disk if changed
:param filename: file location
"""
# the function 'check_health_data_sources' will call this function without providing a filename when
# 'check_health_data_sources' is called from '_events_to_yaml' within 'eql_yaml.py'
if filename:
last_error_file = 'cache/last-error-state_' + os.path.splitext(os.path.basename(filename))[0]
def _update(error):
with open(last_error_file, 'wb') as fd:
pickle.dump(error, fd)
if not os.path.exists(last_error_file):
_update(has_error)
else:
error_state_cache = _get_health_state_cache(filename)
if error_state_cache != has_error:
_update(has_error)
def check_health_data_sources(filename, ds_content, health_is_called, no_print=False):
"""
Check on errors in the provided data sources administration YAML file.
:param filename: YAML file location
:param ds_content: content of the YAML file in a list of dicts
:param health_is_called: boolean that specifies if detailed errors in the file will be printed to stdout
:param no_print: specifies if the non-detailed error message is printed to stdout or not
:return: False if no errors have been found, otherwise True
"""
has_error = False
for ds in ds_content['data_sources']:
# check for missing keys
for key in ['data_source_name', 'date_registered', 'date_connected', 'products', 'available_for_data_analytics', 'comment', 'data_quality']:
if key not in ds:
has_error = _print_error_msg('[!] Data source: \'' + ds['data_source_name'] + '\' is MISSING a key-value pair: ' + key, health_is_called)
for key in ['date_registered', 'date_connected']:
if key in ds and not ds[key] is None:
try:
# noinspection PyStatementEffect
ds[key].year
# noinspection PyStatementEffect
ds[key].month
# noinspection PyStatementEffect
ds[key].day
except AttributeError:
has_error = _print_error_msg('[!] Data source: \'' + ds['data_source_name'] + '\' has an INVALID date format for the key-value pair \'' + key
+ '\': ' + str(ds[key]) + ' (should be YYYY-MM-DD without quotes)', health_is_called)
if 'available_for_data_analytics' in ds:
if not isinstance(ds['available_for_data_analytics'], bool):
has_error = _print_error_msg('[!] Data source: \'' + ds['data_source_name'] + '\' has an INVALID \'available_for_data_analytics\' value: should be set to \'true\' or \'false\'', health_is_called)
if 'data_quality' in ds:
if isinstance(ds['data_quality'], dict):
for dimension in ['device_completeness', 'data_field_completeness', 'timeliness', 'consistency', 'retention']:
if dimension not in ds['data_quality']:
has_error = _print_error_msg('[!] Data source: \'' + ds['data_source_name'] + '\' is MISSING a key-value pair in \'data_quality\': ' + dimension, health_is_called)
else:
if isinstance(ds['data_quality'][dimension], int):
if not 0 <= ds['data_quality'][dimension] <= 5:
has_error = _print_error_msg('[!] Data source: \'' + ds['data_source_name'] + '\' has an INVALID data quality score for the dimension \''
+ dimension + '\': ' + str(ds['data_quality'][dimension]) + ' (should be between 0 and 5)', health_is_called)
else:
has_error = _print_error_msg('[!] Data source: \'' + ds['data_source_name'] + '\' has an INVALID data quality score for the dimension \'' +
dimension + '\': ' + str(ds['data_quality'][dimension]) + ' (should be an integer)', health_is_called)
else:
has_error = _print_error_msg('[!] Data source: \'' + ds['data_source_name'] + '\' the key-value pair \'data_quality\' is NOT a dictionary with data quality dimension scores', health_is_called)
if has_error and not health_is_called and not no_print:
print(HEALTH_ERROR_TXT + filename)
_update_health_state_cache(filename, has_error)
return has_error
def _check_health_score_object(yaml_object, object_type, tech_id, health_is_called):
"""
Check the health of a score_logbook inside a visibility or detection YAML object
:param yaml_object: YAML file lines
:param object_type: 'detection' or 'visibility'
:param tech_id: ATT&CK technique ID
:param health_is_called: boolean that specifies if detailed errors in the file will be printed
:return: True if the YAML file is unhealthy, otherwise False
"""
has_error = False
min_score = None
max_score = None
if object_type == 'detection':
min_score = -1
max_score = 5
elif object_type == 'visibility':
min_score = 0
max_score = 4
if not isinstance(yaml_object['score_logbook'], list):
yaml_object['score_logbook'] = [yaml_object['score_logbook']]
try:
for score_obj in yaml_object['score_logbook']:
for key in ['date', 'score', 'comment']:
if key not in score_obj:
has_error = _print_error_msg('[!] Technique ID: ' + tech_id + ' is MISSING a key-value pair in a ' + object_type + ' score object within the \'score_logbook\': ' + key, health_is_called)
if score_obj['score'] is None:
has_error = _print_error_msg('[!] Technique ID: ' + tech_id + ' has an EMPTY key-value pair in a ' + object_type + ' score object within the \'score_logbook\': score', health_is_called)
elif not isinstance(score_obj['score'], int):
has_error = _print_error_msg('[!] Technique ID: ' + tech_id + ' has an INVALID score format in a ' + object_type + ' score object within the \'score_logbook\': ' + str(score_obj['score']) + ' (should be an integer)', health_is_called)
if 'auto_generated' in score_obj:
if not isinstance(score_obj['auto_generated'], bool):
has_error = _print_error_msg(
'[!] Technique ID: ' + tech_id + ' has an INVALID \'auto_generated\' value in a ' + object_type + ' score object within the \'score_logbook\': should be set to \'true\' or \'false\'', health_is_called)
if isinstance(score_obj['score'], int):
if score_obj['date'] is None and score_obj['score'] > -1:
has_error = _print_error_msg('[!] Technique ID: ' + tech_id + ' has an EMPTY key-value pair in a ' + object_type + ' score object within the \'score_logbook\': date', health_is_called)
# noinspection PyChainedComparisons
if not (score_obj['score'] >= min_score and score_obj['score'] <= max_score):
has_error = _print_error_msg(
'[!] Technique ID: ' + tech_id + ' has an INVALID ' + object_type + ' score in a score object within the \'score_logbook\': ' + str(score_obj['score']) + ' (should be between ' + str(min_score) + ' and ' + str(max_score) + ')', health_is_called)
if not score_obj['date'] is None:
try:
# noinspection PyStatementEffect
score_obj['date'].year
# noinspection PyStatementEffect
score_obj['date'].month
# noinspection PyStatementEffect
score_obj['date'].day
except AttributeError:
has_error = _print_error_msg('[!] Technique ID: ' + tech_id + ' has an INVALID date format in a ' + object_type + ' score object within the \'score_logbook\': ' + str(score_obj['date']) + ' (should be YYYY-MM-DD without quotes)', health_is_called)
except KeyError:
pass
return has_error
def _check_health_techniques(filename, technique_content, health_is_called):
"""
Check on errors in the provided technique administration YAML file.
:param filename: YAML file location
:param technique_content: content of the YAML file in a list of dicts
:param health_is_called: boolean that specifies if detailed errors in the file will be printed to stdout
:return:
"""
from generic import load_techniques
has_error = False
# create a list of ATT&CK technique IDs and check for duplicates
tech_ids = list(map(lambda x: x['technique_id'], technique_content['techniques']))
tech_dup = set()
for tech in tech_ids:
if tech not in tech_dup:
tech_dup.add(tech)
else:
has_error = _print_error_msg('[!] Duplicate technique ID: ' + tech, health_is_called)
# check if the technique has a valid format
if not REGEX_YAML_TECHNIQUE_ID_FORMAT.match(tech):
has_error = _print_error_msg('[!] Invalid technique ID: ' + tech, health_is_called)
all_applicable_to = set()
techniques = load_techniques(filename)
for tech, v in techniques[0].items():
for obj_type in ['detection', 'visibility']:
if obj_type not in v:
has_error = _print_error_msg('[!] Technique ID: ' + tech + ' is MISSING a key-value pair: ' + obj_type, health_is_called)
else:
for obj in v[obj_type]:
obj_keys = ['applicable_to', 'comment', 'score_logbook']
obj_keys_list = ['applicable_to']
if obj_type == 'detection':
obj_keys.append('location')
obj_keys_list.append('location')
for okey in obj_keys:
if okey not in obj:
has_error = _print_error_msg('[!] Technique ID: ' + tech + ' is MISSING a key-value pair in \'' + obj_type + '\': ' + okey, health_is_called)
for okey in obj_keys_list:
if okey in obj:
if not isinstance(obj[okey], list):
has_error = _print_error_msg('[!] Technique ID: ' + tech + ' the key-value pair \'' + okey + '\' in \'' + obj_type + '\' is NOT a list', health_is_called)
health = _check_health_score_object(obj, obj_type, tech, health_is_called)
has_error = _update_health_state(has_error, health)
if 'applicable_to' in obj and isinstance(obj['applicable_to'], list):
all_applicable_to.update(obj['applicable_to'])
# get values within the key-value pair 'applicable_to' and 'location' which are a very close match
similar = set()
for i1 in all_applicable_to:
for i2 in all_applicable_to:
match_value = SequenceMatcher(None, i1, i2).ratio()
if match_value > 0.8 and match_value != 1:
similar.add(i1)
similar.add(i2)
if len(similar) > 0:
has_error = _print_error_msg('[!] There are values in the key-value pairs for \'applicable_to\' which are very similar. Correct where necessary:', health_is_called)
for s in similar:
_print_error_msg(' - ' + s, health_is_called)
if has_error and not health_is_called:
print(HEALTH_ERROR_TXT + filename)
_update_health_state_cache(filename, has_error)
def check_yaml_file_health(filename, file_type, health_is_called):
"""
Check on errors in the provided YAML file.
:param filename: YAML file location
:param file_type: currently FILE_TYPE_TECHNIQUE_ADMINISTRATION and FILE_TYPE_DATA_SOURCE_ADMINISTRATION are supported
:param health_is_called: boolean that specifies if detailed errors in the file will be printed to stdout
:return:
"""
from generic import init_yaml
# first we check if the file was modified. Otherwise, the health check is skipped for performance reasons
if _is_file_modified(filename) or health_is_called:
_yaml = init_yaml()
with open(filename, 'r') as yaml_file:
yaml_content = _yaml.load(yaml_file)
if file_type == FILE_TYPE_DATA_SOURCE_ADMINISTRATION:
check_health_data_sources(filename, yaml_content, health_is_called)
elif file_type == FILE_TYPE_TECHNIQUE_ADMINISTRATION:
_check_health_techniques(filename, yaml_content, health_is_called)
elif _get_health_state_cache(filename):
print(HEALTH_ERROR_TXT + filename)
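To illustrate the data source checks above, a hypothetical entry (all values made up) that check_health_data_sources() would flag because of its out-of-range 'retention' score:

from health import check_health_data_sources

example_content = {'data_sources': [{
    'data_source_name': 'Process monitoring',
    'date_registered': None,
    'date_connected': None,
    'products': ['Example product'],
    'available_for_data_analytics': True,
    'comment': '',
    'data_quality': {'device_completeness': 3, 'data_field_completeness': 3, 'timeliness': 3,
                     'consistency': 3, 'retention': 6}}]}  # 6 is outside the 0-5 range and will be reported

# filename=None skips the cache update; health_is_called=True prints the detailed error messages
check_health_data_sources(None, example_content, health_is_called=True)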

View File

@@ -251,6 +251,7 @@ def _menu_data_source(filename_ds):
print('6. update the visibility scores within a technique administration YAML file based on changes within any of '
'the data sources. \nPast visibility scores are preserved in the score_logbook, and manually assigned scores are '
'not updated without your approval. \nThe updated visibility scores are based on the number of available data sources.')
print('7. Check the data sources YAML file for errors.')
print('9. Back to main menu.')
choice = _ask_input()
if choice == '1':
@@ -287,6 +288,10 @@ def _menu_data_source(filename_ds):
print('Updating visibility scores...')
update_technique_administration_file(filename_ds, filename_t)
_wait()
elif choice == '7':
print('Checking the data source YAML for errors...')
check_yaml_file_health(filename_ds, FILE_TYPE_DATA_SOURCE_ADMINISTRATION, health_is_called=True)
_wait()
elif choice == '9':
interactive_menu()
elif choice == 'q':
@@ -318,7 +323,7 @@ def _menu_detection(filename_t):
print('Select what you want to do:')
print('4. Generate a layer for detection coverage for the ATT&CK Navigator.')
print('5. Generate a layer for detection coverage overlaid with visibility for the ATT&CK Navigator.')
print('6. Generate a graph with detection items added through time.')
print('6. Generate a graph with detections added through time.')
print('7. Generate an Excel sheet with all administrated techniques.')
print('8. Check the technique YAML file for errors.')
print('9. Back to main menu.')
@@ -360,7 +365,7 @@ def _menu_detection(filename_t):
print('Generating Excel file...')
export_techniques_list_to_excel(file_tech)
_wait()
elif choice == '8`x':
elif choice == '8':
print('Checking the technique YAML file for errors...')
check_yaml_file_health(filename_t, FILE_TYPE_TECHNIQUE_ADMINISTRATION, health_is_called=True)
_wait()
@@ -397,7 +402,7 @@ def _menu_visibility(filename_t, filename_ds):
print('Select what you want to do:')
print('4. Generate a layer for visibility for the ATT&CK Navigator.')
print('5. Generate a layer for visibility overlaid with detection coverage for the ATT&CK Navigator.')
print('6. Generate a graph with visibility items added through time.')
print('6. Generate a graph with visibility added through time.')
print('7. Generate an Excel sheet with all administrated techniques.')
print('8. Check the technique YAML file for errors.')
print('9. Back to main menu.')
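For the technique administration side, a hypothetical detection object of the shape _check_health_score_object() validates (technique ID, date and scores are made up; detection scores must be between -1 and 5, visibility scores between 0 and 4):

from datetime import date
from health import _check_health_score_object

detection_obj = {
    'applicable_to': ['all'],
    'location': ['EDR'],
    'comment': '',
    'score_logbook': [{'date': date(2019, 8, 20), 'score': 3, 'comment': 'initial score'},
                      {'date': None, 'score': 6, 'comment': ''}]}  # score out of range and date missing: both are reported

_check_health_score_object(detection_obj, 'detection', 'T1003', health_is_called=True)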