# 2019-08-20 09:14:07 +00:00
import os
import pickle
from difflib import SequenceMatcher
from constants import *
def _print_error_msg(msg, print_error):
    """
    Report an error and optionally print its message.
    :param msg: the text to print
    :param print_error: when truthy, msg is written to stdout
    :return: always True, so the result can be assigned directly to an error flag
    """
    if not print_error:
        return True
    print(msg)
    return True
def _update_health_state(current, update):
    """
    Combine the error state found so far with the outcome of a new check.
    :param current: error state so far
    :param update: error state of the latest check
    :return: True when either state is truthy, otherwise the (falsy) value of update
    """
    return True if (current or update) else update
def _is_file_modified(filename):
    """
    Check if the provided file was modified since the last check.

    The modification time seen by the previous call is kept, pickled, in
    'cache/last-modified_<basename without the .yaml suffix>'. That cache file is
    (re)written whenever it is missing or holds a different modification time.
    The 'cache' directory itself is expected to exist.

    :param filename: file location
    :return: True when the file is not yet in the cache or was modified, else False
    """
    base_name = os.path.basename(filename)
    # Remove only a literal '.yaml' suffix. str.rstrip('.yaml') strips a *set* of
    # characters, which maps e.g. 'data.yaml' and 'dat.yaml' onto the same cache entry.
    if base_name.endswith('.yaml'):
        base_name = base_name[:-len('.yaml')]
    last_modified_file = 'cache/last-modified_' + base_name

    def _update_modified_date(date):
        with open(last_modified_file, 'wb') as fd:
            pickle.dump(date, fd)

    last_modified_current = os.path.getmtime(filename)

    # first time this file is seen: remember its modification time
    if not os.path.exists(last_modified_file):
        _update_modified_date(last_modified_current)
        return True

    # NOTE: unpickling is acceptable only because this cache file is written by this module itself
    with open(last_modified_file, 'rb') as f:
        last_modified_cache = pickle.load(f)

    if last_modified_cache != last_modified_current:
        _update_modified_date(last_modified_current)
        return True
    return False
def _error_state_cache_path(filename):
    """
    Build the location of the pickled error state that belongs to the provided file.
    :param filename: file location
    :return: 'cache/last-error-state_' followed by the file's base name without the '.yaml' suffix
    """
    base_name = os.path.basename(filename)
    # Remove only a literal '.yaml' suffix. str.rstrip('.yaml') strips a *set* of
    # characters, which maps e.g. 'data.yaml' and 'dat.yaml' onto the same cache entry.
    if base_name.endswith('.yaml'):
        base_name = base_name[:-len('.yaml')]
    return 'cache/last-error-state_' + base_name


def _get_health_state_cache(filename):
    """
    Get file health state from disk
    :param filename: file location
    :return: the cached error state, or None when no state has been cached for this file
    """
    last_error_file = _error_state_cache_path(filename)
    if not os.path.exists(last_error_file):
        return None

    # NOTE: unpickling is acceptable only because this cache file is written by this module itself
    with open(last_error_file, 'rb') as f:
        return pickle.load(f)


def _update_health_state_cache(filename, has_error):
    """
    Write the file health state to disk if changed
    :param filename: file location; when empty or None nothing is written
    :param has_error: the error state to store
    :return: None
    """
    # the function 'check_health_data_sources' will call this function without providing a filename when
    # 'check_health_data_sources' is called from '_events_to_yaml' within 'eql_yaml.py'
    if not filename:
        return

    last_error_file = _error_state_cache_path(filename)

    # write when nothing is cached yet, or when the cached state differs from the new one
    if not os.path.exists(last_error_file) or _get_health_state_cache(filename) != has_error:
        with open(last_error_file, 'wb') as fd:
            pickle.dump(has_error, fd)
# 2019-12-05 09:43:07 +00:00
def check_health_data_sources ( filename , ds_content , health_is_called , no_print = False , src_eql = False ) :
2019-08-20 09:14:07 +00:00
"""
Check on errors in the provided data sources administration YAML file.
Every problem found is reported through _print_error_msg, which prints the message only when
health_is_called is truthy. Before returning, the resulting error state is handed to
_update_health_state_cache together with filename.
:param filename: YAML file location
:param ds_content: content of the YAML file; the platform value, the list of data sources and the
optional list of exceptions are read from it
:param health_is_called: boolean that specifies if detailed errors in the file will be printed to stdout
:param no_print: specifies if the non-detailed error message is printed to stdout or not
:param src_eql: if True, skip certain checks that can fail because EQL filtered out some data source and the
ATT&CK Platform is not part of the EQL search result
:return: False if no errors have been found, otherwise True
"""
2020-02-10 11:17:00 +00:00
# imported here instead of at module level
# NOTE(review): presumably to avoid a circular import with 'generic' - confirm
from generic import get_applicable_data_sources_platform
2019-08-20 09:14:07 +00:00
has_error = False
2019-11-04 13:48:58 +00:00
platform = ds_content . get ( ' platform ' , None )
2019-12-05 09:43:07 +00:00
# the platform validation below is guarded by 'not src_eql'
# NOTE(review): indentation is lost in this copy of the file, so how far this guard extends
# (platform check only, or also the missing data source check) cannot be read here - confirm
if not src_eql :
2019-11-19 09:10:15 +00:00
# a single platform string is wrapped in a list; a missing or empty value is replaced by a
# placeholder entry that the loop below tests against PLATFORMS like any other value
if platform != ' all ' and platform != [ ' all ' ] :
if isinstance ( platform , str ) :
platform = [ platform ]
if platform is None or len ( platform ) == 0 or platform == ' ' :
platform = [ ' empty ' ]
for p in platform :
if p . lower ( ) not in PLATFORMS . keys ( ) :
has_error = _print_error_msg (
' [!] EMPTY or INVALID value for \' platform \' within the data source admin. '
' file: %s (should be value(s) of: [ %s ] or all) ' % ( p , ' , ' . join ( list ( PLATFORMS . values ( ) ) ) ) ,
health_is_called )
2019-10-17 11:51:06 +00:00
2019-12-05 09:43:07 +00:00
# lower-cased names of all data sources that are present in the file
ds_list = [ kv [ ' data_source_name ' ] . lower ( ) for kv in ds_content [ ' data_sources ' ] ]
2020-03-03 13:49:19 +00:00
# For using the platform variable, we need first-letter-capital values and we don't need the 'empty' value from the check above.
# NOTE(review): when platform is still a plain string at this point, this loop iterates over its
# characters - confirm that this cannot happen or is intended
valid_platform_list = [ ]
for p in platform :
2020-03-04 09:45:06 +00:00
if p . lower ( ) in PLATFORMS . keys ( ) :
valid_platform_list . append ( PLATFORMS [ p . lower ( ) ] )
2020-03-03 13:49:19 +00:00
# every data source that applies to the valid platforms has to be present in the file
applicable_data_sources = get_applicable_data_sources_platform ( valid_platform_list )
2020-02-10 11:17:00 +00:00
for ds in applicable_data_sources :
2019-12-05 09:43:07 +00:00
if ds . lower ( ) not in ds_list :
has_error = _print_error_msg ( ' [!] Data source: \' ' + ds + ' \' is MISSING from the YAML file ' , health_is_called )
2019-08-20 09:14:07 +00:00
# checks that are performed on every data source in the file
for ds in ds_content [ ' data_sources ' ] :
# check for missing keys
# NOTE(review): the message below reads the data source name from ds; when the missing key is the
# name itself, building the message raises a KeyError - confirm
for key in [ ' data_source_name ' , ' date_registered ' , ' date_connected ' , ' products ' , ' available_for_data_analytics ' , ' comment ' , ' data_quality ' ] :
if key not in ds :
2020-02-10 11:17:00 +00:00
has_error = _print_error_msg ( ' [!] Data source: \' ' + ds [ ' data_source_name ' ] +
' \' is MISSING a key-value pair: ' + key , health_is_called )
2019-08-20 09:14:07 +00:00
# date values, when present and not None, must expose .year, .month and .day; a value without
# these attributes (for example a quoted string) is reported as an invalid date
for key in [ ' date_registered ' , ' date_connected ' ] :
if key in ds and not ds [ key ] is None :
try :
2020-02-10 11:17:00 +00:00
# pylint: disable=pointless-statement
2019-08-20 09:14:07 +00:00
ds [ key ] . year
2020-02-10 11:17:00 +00:00
# pylint: disable=pointless-statement
2019-08-20 09:14:07 +00:00
ds [ key ] . month
2020-02-10 11:17:00 +00:00
# pylint: disable=pointless-statement
2019-08-20 09:14:07 +00:00
ds [ key ] . day
except AttributeError :
2020-02-10 11:17:00 +00:00
# NOTE(review): ds[key] is concatenated to a str without conversion; a non-string value
# (for example an int) would raise a TypeError here - confirm
has_error = _print_error_msg ( ' [!] Data source: \' ' + ds [ ' data_source_name ' ] + ' \' has an INVALID data format for the key-value pair \' ' + key
2019-08-20 09:14:07 +00:00
+ ' \' : ' + ds [ key ] + ' (should be YYYY-MM-DD without quotes) ' , health_is_called )
# the value has to be a real boolean
if ' available_for_data_analytics ' in ds :
if not isinstance ( ds [ ' available_for_data_analytics ' ] , bool ) :
2020-02-10 11:17:00 +00:00
has_error = _print_error_msg ( ' [!] Data source: \' ' + ds [ ' data_source_name ' ] +
' \' has an INVALID \' available_for_data_analytics \' value: should be set to \' true \' or \' false \' ' , health_is_called )
2019-08-20 09:14:07 +00:00
# data_quality has to be a dictionary that holds, for each of the five dimensions, an integer
# score between 0 and 5
if ' data_quality ' in ds :
if isinstance ( ds [ ' data_quality ' ] , dict ) :
for dimension in [ ' device_completeness ' , ' data_field_completeness ' , ' timeliness ' , ' consistency ' , ' retention ' ] :
if dimension not in ds [ ' data_quality ' ] :
2020-02-10 11:17:00 +00:00
has_error = _print_error_msg ( ' [!] Data source: \' ' + ds [ ' data_source_name ' ] +
' \' is MISSING a key-value pair in \' data_quality \' : ' + dimension , health_is_called )
2019-08-20 09:14:07 +00:00
else :
if isinstance ( ds [ ' data_quality ' ] [ dimension ] , int ) :
if not 0 < = ds [ ' data_quality ' ] [ dimension ] < = 5 :
has_error = _print_error_msg ( ' [!] Data source: \' ' + ds [ ' data_source_name ' ] + ' \' has an INVALID data quality score for the dimension \' '
+ dimension + ' \' : ' + str ( ds [ ' data_quality ' ] [ dimension ] ) + ' (should be between 0 and 5) ' , health_is_called )
else :
# NOTE(review): the message text below contains a doubled word ("an an")
has_error = _print_error_msg ( ' [!] Data source: \' ' + ds [ ' data_source_name ' ] + ' \' has an INVALID data quality score for the dimension \' ' +
dimension + ' \' : ' + str ( ds [ ' data_quality ' ] [ dimension ] ) + ' (should be an an integer) ' , health_is_called )
else :
2020-02-10 11:17:00 +00:00
has_error = _print_error_msg ( ' [!] Data source: \' ' + ds [ ' data_source_name ' ] +
' \' the key-value pair \' data_quality \' is NOT a dictionary with data quality dimension scores ' , health_is_called )
2019-11-05 09:21:42 +00:00
# optional list of exceptions: every technique ID, converted with str(), has to match
# REGEX_YAML_TECHNIQUE_ID_FORMAT
# NOTE(review): the extra comparison presumably lets an empty (None) technique ID pass - confirm
if ' exceptions ' in ds_content :
for tech in ds_content [ ' exceptions ' ] :
tech_id = str ( tech [ ' technique_id ' ] )
2019-10-17 11:51:06 +00:00
if not REGEX_YAML_TECHNIQUE_ID_FORMAT . match ( tech_id ) and tech_id != ' None ' :
2020-02-10 11:17:00 +00:00
has_error = _print_error_msg (
' [!] INVALID technique ID in the \' exceptions \' list of data source admin. file: ' + tech_id , health_is_called )
2019-08-20 09:14:07 +00:00
# when the detailed messages were not printed, print the generic error text instead (unless no_print is set)
if has_error and not health_is_called and not no_print :
print ( HEALTH_ERROR_TXT + filename )
# store the outcome so that a later run can report it without checking the file again
_update_health_state_cache ( filename , has_error )
return has_error
def _check_health_score_object ( yaml_object , object_type , tech_id , health_is_called ) :
"""
Check the health of a score_logbook inside a visibility or detection YAML object.
When the score_logbook value is not a list, it is replaced inside yaml_object itself by a
one-element list before it is checked.
:param yaml_object: detection or visibility object of which the score_logbook is checked
:param object_type: 'detection' or 'visibility'; selects the allowed score range
:param tech_id: ATT&CK technique ID, only used in the error messages
:param health_is_called: boolean that specifies if detailed errors in the file will be printed
:return: True if the YAML file is unhealthy, otherwise False
"""
has_error = False
# allowed score range: -1 up to and including 5 for detection, 0 up to and including 4 for visibility
# NOTE(review): for any other object_type both limits stay None and the range comparison further
# down would raise a TypeError for an integer score - confirm callers only pass these two values
min_score = None
max_score = None
if object_type == ' detection ' :
min_score = - 1
max_score = 5
elif object_type == ' visibility ' :
min_score = 0
max_score = 4
# a single score object is wrapped in a list (this modifies yaml_object)
if not isinstance ( yaml_object [ ' score_logbook ' ] , list ) :
yaml_object [ ' score_logbook ' ] = [ yaml_object [ ' score_logbook ' ] ]
# the whole loop sits inside the try: the first KeyError raised by a missing key (which has been
# reported just before) ends the checks for all remaining score objects as well
try :
for score_obj in yaml_object [ ' score_logbook ' ] :
for key in [ ' date ' , ' score ' , ' comment ' ] :
if key not in score_obj :
2020-02-10 11:17:00 +00:00
has_error = _print_error_msg ( ' [!] Technique ID: ' + tech_id + ' is MISSING a key-value pair in a ' +
object_type + ' score object within the \' score_logbook \' : ' + key , health_is_called )
2019-08-20 09:14:07 +00:00
# the score has to be present and an integer
if score_obj [ ' score ' ] is None :
2020-02-10 11:17:00 +00:00
has_error = _print_error_msg ( ' [!] Technique ID: ' + tech_id + ' has an EMPTY key-value pair in a ' +
object_type + ' score object within the \' score_logbook \' : score ' , health_is_called )
2019-08-20 09:14:07 +00:00
elif not isinstance ( score_obj [ ' score ' ] , int ) :
2020-02-10 11:17:00 +00:00
# NOTE(review): the score is concatenated to a str without conversion; a non-string value
# (for example a float) would raise a TypeError here - confirm
has_error = _print_error_msg ( ' [!] Technique ID: ' + tech_id + ' has an INVALID score format in a ' + object_type +
' score object within the \' score_logbook \' : ' + score_obj [ ' score ' ] + ' (should be an integer) ' , health_is_called )
2019-08-20 09:14:07 +00:00
# auto_generated is optional, but has to be a boolean when present
if ' auto_generated ' in score_obj :
if not isinstance ( score_obj [ ' auto_generated ' ] , bool ) :
has_error = _print_error_msg (
' [!] Technique ID: ' + tech_id + ' has an INVALID \' auto_generated \' value in a ' + object_type + ' score object within the \' score_logbook \' : should be set to \' true \' or \' false \' ' , health_is_called )
if isinstance ( score_obj [ ' score ' ] , int ) :
2019-12-12 14:21:27 +00:00
# a date is required once the score is above the lowest value of its type
# (above -1 for detection, above 0 for visibility)
if score_obj [ ' date ' ] is None and ( ( score_obj [ ' score ' ] > - 1 and object_type == ' detection ' ) or ( score_obj [ ' score ' ] > 0 and object_type == ' visibility ' ) ) :
2020-02-10 11:17:00 +00:00
has_error = _print_error_msg ( ' [!] Technique ID: ' + tech_id + ' has an EMPTY key-value pair in a ' +
object_type + ' score object within the \' score_logbook \' : date ' , health_is_called )
2019-08-20 09:14:07 +00:00
# the score has to be within the range that was selected at the top
if not ( score_obj [ ' score ' ] > = min_score and score_obj [ ' score ' ] < = max_score ) :
has_error = _print_error_msg (
' [!] Technique ID: ' + tech_id + ' has an INVALID ' + object_type + ' score in a score object within the \' score_logbook \' : ' + str ( score_obj [ ' score ' ] ) + ' (should be between ' + str ( min_score ) + ' and ' + str ( max_score ) + ' ) ' , health_is_called )
# a date that is set must expose .year, .month and .day; a value without these attributes
# (for example a quoted string) is reported as an invalid date
if not score_obj [ ' date ' ] is None :
try :
2020-02-10 11:17:00 +00:00
# pylint: disable=pointless-statement
2019-08-20 09:14:07 +00:00
score_obj [ ' date ' ] . year
2020-02-10 11:17:00 +00:00
# pylint: disable=pointless-statement
2019-08-20 09:14:07 +00:00
score_obj [ ' date ' ] . month
2020-02-10 11:17:00 +00:00
# pylint: disable=pointless-statement
2019-08-20 09:14:07 +00:00
score_obj [ ' date ' ] . day
except AttributeError :
2020-02-10 11:17:00 +00:00
# NOTE(review): the date value is concatenated to a str without conversion; a non-string
# value (for example an int) would raise a TypeError here - confirm
has_error = _print_error_msg ( ' [!] Technique ID: ' + tech_id + ' has an INVALID data format in a ' + object_type +
' score object within the \' score_logbook \' : ' + score_obj [ ' date ' ] + ' (should be YYYY-MM-DD without quotes) ' , health_is_called )
2019-08-20 09:14:07 +00:00
# a missing key has already been reported by the key check above, so the KeyError itself is ignored
except KeyError :
pass
return has_error
def _check_health_techniques ( filename , technique_content , health_is_called ) :
"""
Check on errors in the provided technique administration YAML file.
Every problem found is reported through _print_error_msg, which prints the message only when
health_is_called is truthy. At the end the resulting error state is handed to
_update_health_state_cache together with filename.
:param filename: YAML file location; also passed to load_techniques
:param technique_content: content of the YAML file; the platform value and the list of techniques are read from it
:param health_is_called: boolean that specifies if detailed errors in the file will be printed to stdout
:return: None (the function has no return statement)
"""
# imported here instead of at module level
# NOTE(review): presumably to avoid a circular import with 'generic' - confirm
from generic import load_techniques
has_error = False
2019-11-04 13:48:58 +00:00
platform = technique_content . get ( ' platform ' , None )
2019-11-05 09:21:42 +00:00
# a single platform string is wrapped in a list; a missing or empty value is replaced by a
# placeholder entry that the loop below tests against PLATFORMS like any other value
if platform != ' all ' and platform != [ ' all ' ] :
if isinstance ( platform , str ) :
platform = [ platform ]
if platform is None or len ( platform ) == 0 or platform == ' ' :
platform = [ ' empty ' ]
2019-11-04 13:48:58 +00:00
for p in platform :
2019-11-05 09:21:42 +00:00
if p . lower ( ) not in PLATFORMS . keys ( ) :
2019-11-04 13:48:58 +00:00
# NOTE(review): the message mentions the data source admin. file although this function
# checks the technique administration content - confirm whether the wording is intended
has_error = _print_error_msg (
2019-11-05 09:21:42 +00:00
' [!] EMPTY or INVALID value for \' platform \' within the data source admin. '
' file: %s (should be value(s) of: [ %s ] or all) ' % ( p , ' , ' . join ( list ( PLATFORMS . values ( ) ) ) ) ,
2019-11-04 13:48:58 +00:00
health_is_called )
2019-08-20 09:14:07 +00:00
# create a list of ATT&CK technique IDs and check for duplicates
tech_ids = list ( map ( lambda x : x [ ' technique_id ' ] , technique_content [ ' techniques ' ] ) )
tech_dup = set ( )
for tech in tech_ids :
if tech not in tech_dup :
tech_dup . add ( tech )
else :
has_error = _print_error_msg ( ' [!] Duplicate technique ID: ' + tech , health_is_called )
# check if the technique has a valid format
if not REGEX_YAML_TECHNIQUE_ID_FORMAT . match ( tech ) :
has_error = _print_error_msg ( ' [!] Invalid technique ID: ' + tech , health_is_called )
# collects every applicable_to value that is seen, for the similarity check at the end
all_applicable_to = set ( )
# load_techniques is given the file name (not technique_content); only element [0] of its result
# is used, as a mapping from technique ID to the technique's detection/visibility objects
techniques = load_techniques ( filename )
for tech , v in techniques [ 0 ] . items ( ) :
# both object types have to be present for every technique
for obj_type in [ ' detection ' , ' visibility ' ] :
if obj_type not in v :
has_error = _print_error_msg ( ' [!] Technique ID: ' + tech + ' is MISSING a key-value pair: ' + obj_type , health_is_called )
else :
for obj in v [ obj_type ] :
# obj_keys: keys that have to be present
# obj_keys_list: keys of which the value has to be a list
# obj_keys_not_none: keys of which the value may not contain a None item
obj_keys = [ ' applicable_to ' , ' comment ' , ' score_logbook ' ]
obj_keys_list = [ ' applicable_to ' ]
2019-11-29 11:22:10 +00:00
obj_keys_not_none = [ ]
2019-12-05 15:07:02 +00:00
obj_keys_not_none . append ( ' applicable_to ' )
2019-08-20 09:14:07 +00:00
# detection objects additionally carry a location, to which all three checks apply
if obj_type == ' detection ' :
obj_keys . append ( ' location ' )
obj_keys_list . append ( ' location ' )
2019-11-29 11:22:10 +00:00
obj_keys_not_none . append ( ' location ' )
2019-08-20 09:14:07 +00:00
for okey in obj_keys :
if okey not in obj :
2020-02-10 11:17:00 +00:00
has_error = _print_error_msg ( ' [!] Technique ID: ' + tech +
' is MISSING a key-value pair in \' ' + obj_type + ' \' : ' + okey , health_is_called )
2019-08-20 09:14:07 +00:00
for okey in obj_keys_list :
if okey in obj :
if not isinstance ( obj [ okey ] , list ) :
2020-02-10 11:17:00 +00:00
has_error = _print_error_msg ( ' [!] Technique ID: ' + tech + ' the key-value pair \' ' + okey +
' \' in \' ' + obj_type + ' \' is NOT a list ' , health_is_called )
2019-08-20 09:14:07 +00:00
2019-11-29 11:22:10 +00:00
# count the None items; the message differs for exactly one and for more than one
# NOTE(review): the value is iterated even when the list check above has failed; a value that
# is None would raise a TypeError here - confirm
for okey in obj_keys_not_none :
if okey in obj :
none_count = 0
for item in obj [ okey ] :
if item is None :
none_count + = 1
if none_count == 1 :
2020-02-10 11:17:00 +00:00
has_error = _print_error_msg ( ' [!] Technique ID: ' + tech + ' the key-value pair \' ' + okey + ' \' in \' ' +
obj_type + ' \' has an EMPTY value (an empty string is allowed: \' \' ) ' , health_is_called )
2019-11-29 11:22:10 +00:00
elif none_count > 1 :
2020-02-10 11:17:00 +00:00
has_error = _print_error_msg ( ' [!] Technique ID: ' + tech + ' the key-value pair \' ' + okey + ' \' in \' ' + obj_type +
' \' has multiple EMPTY values (an empty string is allowed: \' \' ) ' , health_is_called )
2019-11-29 11:22:10 +00:00
2019-08-20 09:14:07 +00:00
# check the score_logbook of this object and merge the outcome into has_error
# (_update_health_state returns True as soon as either of its arguments is truthy)
health = _check_health_score_object ( obj , obj_type , tech , health_is_called )
has_error = _update_health_state ( has_error , health )
if ' applicable_to ' in obj and isinstance ( obj [ ' applicable_to ' ] , list ) :
all_applicable_to . update ( obj [ ' applicable_to ' ] )
# get values within the key-value pair 'applicable_to' which are a very close match
# (every value is compared with every value; a ratio of exactly 1, which includes a value compared
# with itself, is not counted as similar)
similar = set ( )
for i1 in all_applicable_to :
for i2 in all_applicable_to :
match_value = SequenceMatcher ( None , i1 , i2 ) . ratio ( )
if match_value > 0.8 and match_value != 1 :
similar . add ( i1 )
similar . add ( i2 )
if len ( similar ) > 0 :
2020-02-10 11:17:00 +00:00
has_error = _print_error_msg (
' [!] There are values in the key-value pairs for \' applicable_to \' which are very similar. Correct where necessary: ' , health_is_called )
2019-08-20 09:14:07 +00:00
# list the values that were found to be similar
for s in similar :
_print_error_msg ( ' - ' + s , health_is_called )
# when the detailed messages were not printed, print the generic error text instead
if has_error and not health_is_called :
print ( HEALTH_ERROR_TXT + filename )
# store the outcome so that a later run can report it without checking the file again
_update_health_state_cache ( filename , has_error )
def check_yaml_file_health(filename, file_type, health_is_called):
    """
    Check on errors in the provided YAML file.

    The file is loaded and checked when it was modified since the previous check, or when
    health_is_called is set. Otherwise the check is skipped and the generic error text is printed
    when the cached health state of the file is truthy.

    :param filename: YAML file location
    :param file_type: currently FILE_TYPE_TECHNIQUE_ADMINISTRATION and FILE_TYPE_DATA_SOURCE_ADMINISTRATION is supported
    :param health_is_called: boolean that specifies if detailed errors in the file will be printed to stdout
    :return: None
    """
    from generic import init_yaml

    # first we check if the file was modified. Otherwise, the health check is skipped for performance reasons
    # (_is_file_modified is evaluated first on purpose: it also refreshes the modification cache)
    if _is_file_modified(filename) or health_is_called:
        _yaml = init_yaml()
        with open(filename, 'r') as yaml_file:
            yaml_content = _yaml.load(yaml_file)

        # any other file type is loaded but not checked
        if file_type == FILE_TYPE_DATA_SOURCE_ADMINISTRATION:
            check_health_data_sources(filename, yaml_content, health_is_called)
        elif file_type == FILE_TYPE_TECHNIQUE_ADMINISTRATION:
            _check_health_techniques(filename, yaml_content, health_is_called)
    elif _get_health_state_cache(filename):
        # unchanged file: report the error state that an earlier check stored
        print(HEALTH_ERROR_TXT + filename)