Implemented a health check for data source administration YAML files.

master
Marcus Bakker 2019-08-20 11:14:07 +02:00
parent 248c6a07d8
commit 98067447c6
3 changed files with 324 additions and 11 deletions

View File

@ -51,6 +51,7 @@ def _init_menu():
'not updated without your approval. The updated visibility '
'scores are calculated in the same way as with the option: '
'-y, --yaml', action='store_true')
parser_data_sources.add_argument('--health', help='check the YAML file(s) for errors', action='store_true')
# create the visibility parser
parser_visibility = subparsers.add_parser('visibility', aliases=['v'],
@ -76,9 +77,9 @@ def _init_menu():
action='store_true')
parser_visibility.add_argument('-o', '--overlay', help='generate a visibility layer overlaid with detections for '
'the ATT&CK navigator', action='store_true')
parser_visibility.add_argument('-g', '--graph', help='generate a graph with visibility items added through time',
action='store_true')
parser_visibility.add_argument('--health', help='check the technique YAML file for errors', action='store_true')
parser_visibility.add_argument('-g', '--graph', help='generate a graph with visibility added through time',
action='store_true')
parser_visibility.add_argument('--health', help='check the YAML file for errors', action='store_true')
# create the detection parser
parser_detection = subparsers.add_parser('detection', aliases=['d'],
@ -106,9 +107,9 @@ def _init_menu():
action='store_true')
parser_detection.add_argument('-o', '--overlay', help='generate a detection layer overlaid with visibility for '
'the ATT&CK navigator', action='store_true')
parser_detection.add_argument('-g', '--graph', help='generate a graph with detection items added through time',
parser_detection.add_argument('-g', '--graph', help='generate a graph with detections added through time',
action='store_true')
parser_detection.add_argument('--health', help='check the technique YAML file for errors', action='store_true')
parser_detection.add_argument('--health', help='check the YAML file(s) for errors', action='store_true')
# create the group parser
parser_group = subparsers.add_parser('group', aliases=['g'],
@ -145,7 +146,7 @@ def _init_menu():
'the EQL search. The default behaviour is to only include the '
'most recent \'score\' objects',
action='store_true', default=False)
parser_group.add_argument('--health', help='check the technique YAML file for errors', action='store_true')
parser_group.add_argument('--health', help='check the YAML file(s) for errors', action='store_true')
# create the generic parser
parser_generic = subparsers.add_parser('generic', description='Generic functions which will output to stdout.',
@ -180,14 +181,14 @@ def _menu(menu_parser):
interactive_menu()
elif args.subparser in ['datasource', 'ds']:
if check_file(args.file_ds, FILE_TYPE_DATA_SOURCE_ADMINISTRATION):
if check_file(args.file_ds, FILE_TYPE_DATA_SOURCE_ADMINISTRATION, args.health):
file_ds = args.file_ds
if args.search:
file_ds = search(args.file_ds, FILE_TYPE_DATA_SOURCE_ADMINISTRATION, args.search)
if not file_ds:
quit() # something went wrong in executing the search or 0 results where returned
if args.update and check_file(args.file_tech, FILE_TYPE_TECHNIQUE_ADMINISTRATION):
if args.update and check_file(args.file_tech, FILE_TYPE_TECHNIQUE_ADMINISTRATION, args.health):
update_technique_administration_file(file_ds, args.file_tech)
if args.layer:
generate_data_sources_layer(file_ds)

307
health.py Normal file
View File

@ -0,0 +1,307 @@
import os
import pickle
from difflib import SequenceMatcher
from constants import *
def _print_error_msg(msg, print_error):
if print_error:
print(msg)
return True
def _update_health_state(current, update):
if current or update:
return True
else:
return update
def _is_file_modified(filename):
"""
Check if the provided file was modified since the last check
:param filename: file location
:return: true when modified else false
"""
last_modified_file = 'cache/last-modified_' + os.path.basename(filename).rstrip('.yaml')
def _update_modified_date(date):
with open(last_modified_file, 'wb') as fd:
pickle.dump(date, fd)
if not os.path.exists(last_modified_file):
last_modified = os.path.getmtime(filename)
_update_modified_date(last_modified)
return True
else:
with open(last_modified_file, 'rb') as f:
last_modified_cache = pickle.load(f)
last_modified_current = os.path.getmtime(filename)
if last_modified_cache != last_modified_current:
_update_modified_date(last_modified_current)
return True
else:
return False
def _get_health_state_cache(filename):
"""
Get file health state from disk
:param filename: file location
:return: the cached error state
"""
last_error_file = 'cache/last-error-state_' + os.path.basename(filename).rstrip('.yaml')
if os.path.exists(last_error_file):
with open(last_error_file, 'rb') as f:
last_error_state_cache = pickle.load(f)
return last_error_state_cache
def _update_health_state_cache(filename, has_error):
"""
Write the file health state to disk if changed
:param filename: file location
"""
# the function 'check_health_data_sources' will call this function without providing a filename when
# 'check_health_data_sources' is called from '_events_to_yaml' within 'eql_yaml.py'
if filename:
last_error_file = 'cache/last-error-state_' + os.path.basename(filename).rstrip('.yaml')
def _update(error):
with open(last_error_file, 'wb') as fd:
pickle.dump(error, fd)
if not os.path.exists(last_error_file):
_update(has_error)
else:
error_state_cache = _get_health_state_cache(filename)
if error_state_cache != has_error:
_update(has_error)
def check_health_data_sources(filename, ds_content, health_is_called, no_print=False):
    """
    Check on errors in the provided data sources administration YAML file.
    :param filename: YAML file location
    :param ds_content: content of the YAML file in a list of dicts
    :param health_is_called: boolean that specifies if detailed errors in the file will be printed to stdout
    :param no_print: specifies if the non-detailed error message is printed to stdout or not
    :return: False if no errors have been found, otherwise True
    """
    has_error = False
    for ds in ds_content['data_sources']:
        # use a fallback name so a missing 'data_source_name' does not make
        # the health check itself crash with a KeyError
        ds_name = ds.get('data_source_name', 'UNKNOWN')

        # check for missing keys
        for key in ['data_source_name', 'date_registered', 'date_connected', 'products', 'available_for_data_analytics', 'comment', 'data_quality']:
            if key not in ds:
                has_error = _print_error_msg('[!] Data source: \'' + ds_name + '\' is MISSING a key-value pair: ' + key, health_is_called)

        # dates must be YAML date objects (i.e. expose year/month/day attributes)
        for key in ['date_registered', 'date_connected']:
            if key in ds and not ds[key] is None:
                try:
                    # noinspection PyStatementEffect
                    ds[key].year
                    # noinspection PyStatementEffect
                    ds[key].month
                    # noinspection PyStatementEffect
                    ds[key].day
                except AttributeError:
                    # bug fix: the original referenced the undefined name
                    # 'dimension' here (NameError); also str() the value as it
                    # may not be a string
                    has_error = _print_error_msg('[!] Data source: \'' + ds_name + '\' has an INVALID data format for the dimension \'' + key
                                                 + '\': ' + str(ds[key]) + ' (should be YYYY-MM-DD without quotes)', health_is_called)

        if 'available_for_data_analytics' in ds:
            if not isinstance(ds['available_for_data_analytics'], bool):
                has_error = _print_error_msg('[!] Data source: \'' + ds_name + '\' has an INVALID \'available_for_data_analytics\' value: should be set to \'true\' or \'false\'', health_is_called)

        if 'data_quality' in ds:
            if isinstance(ds['data_quality'], dict):
                for dimension in ['device_completeness', 'data_field_completeness', 'timeliness', 'consistency', 'retention']:
                    if dimension not in ds['data_quality']:
                        has_error = _print_error_msg('[!] Data source: \'' + ds_name + '\' is MISSING a key-value pair in \'data_quality\': ' + dimension, health_is_called)
                    else:
                        if isinstance(ds['data_quality'][dimension], int):
                            if not 0 <= ds['data_quality'][dimension] <= 5:
                                has_error = _print_error_msg('[!] Data source: \'' + ds_name + '\' has an INVALID data quality score for the dimension \''
                                                             + dimension + '\': ' + str(ds['data_quality'][dimension]) + ' (should be between 0 and 5)', health_is_called)
                        else:
                            # bug fix: message said "an an integer" in the original
                            has_error = _print_error_msg('[!] Data source: \'' + ds_name + '\' has an INVALID data quality score for the dimension \'' +
                                                         dimension + '\': ' + str(ds['data_quality'][dimension]) + ' (should be an integer)', health_is_called)
            else:
                has_error = _print_error_msg('[!] Data source: \'' + ds_name + '\' the key-value pair \'data_quality\' is NOT a dictionary with data quality dimension scores', health_is_called)

    if has_error and not health_is_called and not no_print:
        # short, non-detailed notification that the file contains errors
        print(HEALTH_ERROR_TXT + filename)

    _update_health_state_cache(filename, has_error)

    return has_error
def _check_health_score_object(yaml_object, object_type, tech_id, health_is_called):
    """
    Check the health of a score_logbook inside a visibility or detection YAML object
    :param yaml_object: the detection or visibility YAML object
    :param object_type: 'detection' or 'visibility'
    :param tech_id: ATT&CK technique ID
    :param health_is_called: boolean that specifies if detailed errors in the file will be printed
    :return: True if the YAML object is unhealthy, otherwise False
    """
    has_error = False

    # valid score boundaries differ per object type
    min_score = None
    max_score = None
    if object_type == 'detection':
        min_score = -1
        max_score = 5
    elif object_type == 'visibility':
        min_score = 0
        max_score = 4

    # normalise a single score object to a list of score objects
    if not isinstance(yaml_object['score_logbook'], list):
        yaml_object['score_logbook'] = [yaml_object['score_logbook']]

    for score_obj in yaml_object['score_logbook']:
        for key in ['date', 'score', 'comment']:
            if key not in score_obj:
                has_error = _print_error_msg('[!] Technique ID: ' + tech_id + ' is MISSING a key-value pair in a ' + object_type + ' score object within the \'score_logbook\': ' + key, health_is_called)

        # bug fix: the original wrapped the whole loop in 'except KeyError: pass',
        # so a single score object missing 'score' or 'date' silently aborted the
        # validation of ALL remaining score objects. Guard each access instead.
        if 'score' in score_obj:
            if score_obj['score'] is None:
                has_error = _print_error_msg('[!] Technique ID: ' + tech_id + ' has an EMPTY key-value pair in a ' + object_type + ' score object within the \'score_logbook\': score', health_is_called)
            elif not isinstance(score_obj['score'], int):
                # bug fix: str() the invalid value; the original concatenated it
                # directly, which raises TypeError for non-string values
                has_error = _print_error_msg('[!] Technique ID: ' + tech_id + ' has an INVALID score format in a ' + object_type + ' score object within the \'score_logbook\': ' + str(score_obj['score']) + ' (should be an integer)', health_is_called)

        if 'auto_generated' in score_obj:
            if not isinstance(score_obj['auto_generated'], bool):
                has_error = _print_error_msg(
                    '[!] Technique ID: ' + tech_id + ' has an INVALID \'auto_generated\' value in a ' + object_type + ' score object within the \'score_logbook\': should be set to \'true\' or \'false\'', health_is_called)

        if isinstance(score_obj.get('score'), int):
            # a real score (> -1) must be accompanied by a date
            if score_obj.get('date') is None and score_obj['score'] > -1:
                has_error = _print_error_msg('[!] Technique ID: ' + tech_id + ' has an EMPTY key-value pair in a ' + object_type + ' score object within the \'score_logbook\': date', health_is_called)

            if not (min_score <= score_obj['score'] <= max_score):
                has_error = _print_error_msg(
                    '[!] Technique ID: ' + tech_id + ' has an INVALID ' + object_type + ' score in a score object within the \'score_logbook\': ' + str(score_obj['score']) + ' (should be between ' + str(min_score) + ' and ' + str(max_score) + ')', health_is_called)

        # the date must be a YAML date object (i.e. expose year/month/day attributes)
        if score_obj.get('date') is not None:
            try:
                # noinspection PyStatementEffect
                score_obj['date'].year
                # noinspection PyStatementEffect
                score_obj['date'].month
                # noinspection PyStatementEffect
                score_obj['date'].day
            except AttributeError:
                has_error = _print_error_msg('[!] Technique ID: ' + tech_id + ' has an INVALID data format in a ' + object_type + ' score object within the \'score_logbook\': ' + str(score_obj['date']) + ' (should be YYYY-MM-DD without quotes)', health_is_called)

    return has_error
def _check_health_techniques(filename, technique_content, health_is_called):
    """
    Check on errors in the provided technique administration YAML file.
    :param filename: YAML file location
    :param technique_content: content of the YAML file in a list of dicts
    :param health_is_called: boolean that specifies if detailed errors in the file will be printed to stdout
    :return:
    """
    from generic import load_techniques

    has_error = False

    # create a list of ATT&CK technique IDs and check for duplicates
    tech_ids = list(map(lambda x: x['technique_id'], technique_content['techniques']))
    tech_dup = set()
    for tech in tech_ids:
        if tech not in tech_dup:
            tech_dup.add(tech)
        else:
            has_error = _print_error_msg('[!] Duplicate technique ID: ' + tech, health_is_called)

        # check if the technique has a valid format
        if not REGEX_YAML_TECHNIQUE_ID_FORMAT.match(tech):
            has_error = _print_error_msg('[!] Invalid technique ID: ' + tech, health_is_called)

    all_applicable_to = set()
    techniques = load_techniques(filename)
    for tech, v in techniques[0].items():
        for obj_type in ['detection', 'visibility']:
            if obj_type not in v:
                has_error = _print_error_msg('[!] Technique ID: ' + tech + ' is MISSING a key-value pair: ' + obj_type, health_is_called)
            else:
                for obj in v[obj_type]:
                    # keys that every detection/visibility object must have,
                    # and keys whose value must be a list
                    obj_keys = ['applicable_to', 'comment', 'score_logbook']
                    obj_keys_list = ['applicable_to']
                    if obj_type == 'detection':
                        obj_keys.append('location')
                        obj_keys_list.append('location')

                    for okey in obj_keys:
                        if okey not in obj:
                            has_error = _print_error_msg('[!] Technique ID: ' + tech + ' is MISSING a key-value pair in \'' + obj_type + '\': ' + okey, health_is_called)

                    for okey in obj_keys_list:
                        if okey in obj:
                            if not isinstance(obj[okey], list):
                                has_error = _print_error_msg('[!] Technique ID: ' + tech + ' the key-value pair \'' + okey + '\' in \'' + obj_type + '\' is NOT a list', health_is_called)

                    health = _check_health_score_object(obj, obj_type, tech, health_is_called)
                    has_error = _update_health_state(has_error, health)

                    if 'applicable_to' in obj and isinstance(obj['applicable_to'], list):
                        all_applicable_to.update(obj['applicable_to'])

    # get values within the key-value pair 'applicable_to' which are a very close match.
    # improvement over the original: compare each unordered pair once instead of every
    # ordered pair including self-pairs; SequenceMatcher's ratio is not symmetric, so
    # both directions are checked to keep the result identical
    similar = set()
    applicable_to_list = list(all_applicable_to)
    for idx, i1 in enumerate(applicable_to_list):
        for i2 in applicable_to_list[idx + 1:]:
            if max(SequenceMatcher(None, i1, i2).ratio(), SequenceMatcher(None, i2, i1).ratio()) > 0.8:
                similar.add(i1)
                similar.add(i2)

    if len(similar) > 0:
        has_error = _print_error_msg('[!] There are values in the key-value pairs for \'applicable_to\' which are very similar. Correct where necessary:', health_is_called)
        for s in similar:
            _print_error_msg(' - ' + s, health_is_called)

    if has_error and not health_is_called:
        # short, non-detailed notification that the file contains errors
        print(HEALTH_ERROR_TXT + filename)

    _update_health_state_cache(filename, has_error)
def check_yaml_file_health(filename, file_type, health_is_called):
    """
    Check on errors in the provided YAML file.
    :param filename: YAML file location
    :param file_type: currently FILE_TYPE_TECHNIQUE_ADMINISTRATION and FILE_TYPE_DATA_SOURCE_ADMINISTRATION is supported
    :param health_is_called: boolean that specifies if detailed errors in the file will be printed to stdout
    :return:
    """
    from generic import init_yaml

    # skip the full check for performance reasons when the file is unchanged,
    # unless a health check was explicitly requested
    # (note: _is_file_modified must run first — it refreshes the mtime cache)
    if not _is_file_modified(filename) and not health_is_called:
        # fall back to the error state cached during a previous run
        if _get_health_state_cache(filename):
            print(HEALTH_ERROR_TXT + filename)
        return

    yaml_loader = init_yaml()
    with open(filename, 'r') as yaml_file:
        yaml_content = yaml_loader.load(yaml_file)

    if file_type == FILE_TYPE_DATA_SOURCE_ADMINISTRATION:
        check_health_data_sources(filename, yaml_content, health_is_called)
    elif file_type == FILE_TYPE_TECHNIQUE_ADMINISTRATION:
        _check_health_techniques(filename, yaml_content, health_is_called)

View File

@ -251,6 +251,7 @@ def _menu_data_source(filename_ds):
print('6. update the visibility scores within a technique administration YAML file based on changes within any of '
'the data sources. \nPast visibility scores are preserved in the score_logbook, and manually assigned scores are '
'not updated without your approval. \nThe updated visibility are based on the number of available data sources.')
print('7. Check the data sources YAML file for errors.')
print('9. Back to main menu.')
choice = _ask_input()
if choice == '1':
@ -287,6 +288,10 @@ def _menu_data_source(filename_ds):
print('Updating visibility scores...')
update_technique_administration_file(filename_ds, filename_t)
_wait()
elif choice == '7':
print('Checking the data source YAML for errors...')
check_yaml_file_health(filename_ds, FILE_TYPE_DATA_SOURCE_ADMINISTRATION, health_is_called=True)
_wait()
elif choice == '9':
interactive_menu()
elif choice == 'q':
@ -318,7 +323,7 @@ def _menu_detection(filename_t):
print('Select what you want to do:')
print('4. Generate a layer for detection coverage for the ATT&CK Navigator.')
print('5. Generate a layer for detection coverage overlaid with visibility for the ATT&CK Navigator.')
print('6. Generate a graph with detection items added through time.')
print('6. Generate a graph with detections added through time.')
print('7. Generate an Excel sheet with all administrated techniques.')
print('8. Check the technique YAML file for errors.')
print('9. Back to main menu.')
@ -360,7 +365,7 @@ def _menu_detection(filename_t):
print('Generating Excel file...')
export_techniques_list_to_excel(file_tech)
_wait()
elif choice == '8`x':
elif choice == '8':
print('Checking the technique YAML file for errors...')
check_yaml_file_health(filename_t, FILE_TYPE_TECHNIQUE_ADMINISTRATION, health_is_called=True)
_wait()
@ -397,7 +402,7 @@ def _menu_visibility(filename_t, filename_ds):
print('Select what you want to do:')
print('4. Generate a layer for visibility for the ATT&CK Navigator.')
print('5. Generate a layer for visibility overlaid with detection coverage for the ATT&CK Navigator.')
print('6. Generate a graph with visibility items added through time.')
print('6. Generate a graph with visibility added through time.')
print('7. Generate an Excel sheet with all administrated techniques.')
print('8. Check the technique YAML file for errors.')
print('9. Back to main menu.')