From 9cbdac310508b7ac62675335f0840bccc1bd5d24 Mon Sep 17 00:00:00 2001 From: Faber Date: Mon, 24 Jun 2019 21:36:54 -0400 Subject: [PATCH 1/6] updates to extractor --- .DS_Store | Bin 0 -> 6148 bytes cortexutils/extractor.py | 99 +++++++++++++++++++++++++++++++--- tests/test_suite_extractor.py | 1 - 3 files changed, 91 insertions(+), 9 deletions(-) create mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..59b0d3ce6786842f37e6894150307a3074424ade GIT binary patch literal 6148 zcmeHK%Sr=55Ukc50=eYqael!+7()C4e?XFW5CRLvfX6-gU4EL?4}!}^@Zv?Pp}S^! zx@OrrY;Oaw&G~Q-tN|?Pj`;8}H$Qiu*-d4PNar2Tc*hxUc)@X2eLdma74{hLO5S(= z>2TT~cf;`8o37ultQ3#}Qa}nw0Vyz5zU!arm~>bTA68GcnoumB&ih-G!+N5k z6p#W_1#WY@_Wu7s|6%^0lC+ZoQs7@HV2kbJcEeYy-a30Z@3oEoME9Ccx*OL)VTg82 jjCRb8x8u7g%DU!jp7+8rG3d+(ov5Dy*F`1;{#t=A 1: + return parts.parsed_url.netloc + return None + + def __init_regex(self): """ Returns compiled regex list. @@ -35,9 +53,15 @@ def __init_regex(): """ # IPv4 + ft_r = '(?:' + \ + '(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.)' + \ + '{3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?' + \ + ')' regex = [{ 'type': 'ip', - 'regex': re.compile(r'[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}') + 'regex': re.compile(r'[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}'), + 'ft_regex': re.compile(r'{}'.format(ft_r)), + 'validator': self.__valid_ip }] # IPv6 @@ -66,21 +90,30 @@ def __init_regex(): }) # URL + ft_r = '(' + \ + '(?:(?:meows?|h[Xxt]{2}ps?)://)?(?:(?:(?:[a-zA-Z0-9\-]+\[?\.\]?)+[a-z]{2,8})' + \ + '|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\[?\.\]?){3}(?:25[0-5]|2[0-4][0-9]' + \ + '|[01]?[0-9][0-9]?))/[^\s\<"]+' + \ + ')' regex.append({ 'type': 'url', - 'regex': re.compile(r'^(http://|https://)') + 'regex': re.compile(r'^(http://|https://)'), + 'ft_regex': re.compile(r'{}'.format(ft_r)) }) # domain regex.append({ 'type': 'domain', - 'regex': re.compile(r'^(?!http://|https://)^[\w\-]+\.[a-zA-Z]+$') + 'regex': re.compile(r'^(?!http://|https://)^[\w\-]+\.[a-zA-Z]+$'), + 'ft_regex': re.compile(r'[\s\>\ Date: Wed, 26 Jun 2019 23:36:51 -0400 Subject: [PATCH 2/6] fixed extractor unit test and removed .DS_Store file/updated .gitignore. still need to add unit test for new features --- .DS_Store | Bin 6148 -> 0 bytes .gitignore | 3 +++ tests/test_suite_extractor.py | 34 ++++++++++++++++++---------------- 3 files changed, 21 insertions(+), 16 deletions(-) delete mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index 59b0d3ce6786842f37e6894150307a3074424ade..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK%Sr=55Ukc50=eYqael!+7()C4e?XFW5CRLvfX6-gU4EL?4}!}^@Zv?Pp}S^! zx@OrrY;Oaw&G~Q-tN|?Pj`;8}H$Qiu*-d4PNar2Tc*hxUc)@X2eLdma74{hLO5S(= z>2TT~cf;`8o37ultQ3#}Qa}nw0Vyz5zU!arm~>bTA68GcnoumB&ih-G!+N5k z6p#W_1#WY@_Wu7s|6%^0lC+ZoQs7@HV2kbJcEeYy-a30Z@3oEoME9Ccx*OL)VTg82 jjCRb8x8u7g%DU!jp7+8rG3d+(ov5Dy*F`1;{#t=A Date: Thu, 27 Jun 2019 00:25:30 -0400 Subject: [PATCH 3/6] simplified recursive function and fixed bug that presented it self in list/dicts for full text --- cortexutils/extractor.py | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/cortexutils/extractor.py b/cortexutils/extractor.py index 0013f4c..492f15c 100644 --- a/cortexutils/extractor.py +++ b/cortexutils/extractor.py @@ -246,32 +246,16 @@ def check_iterable(self, iterable): for dt in types: for val in types[dt]: results.append({ - 'type': dt, - 'value': val + 'dataType': dt, + 'data': val }) elif isinstance(iterable, list): for item in iterable: - if isinstance(item, list) or isinstance(item, dict): - results.extend(self.check_iterable(item)) - else: - dt = self.__checktype(item) - if len(dt) > 0: - results.append({ - 'dataType': dt, - 'data': item - }) + results.extend(self.check_iterable(item)) elif isinstance(iterable, dict): for _, item in iterable.items(): - if isinstance(item, list) or isinstance(item, dict): - results.extend(self.check_iterable(item)) - else: - dt = self.__checktype(item) - if len(dt) > 0: - results.append({ - 'dataType': dt, - 'data': item - }) + results.extend(self.check_iterable(item)) else: raise TypeError('Not supported type.') From b22efdb6a479bf1d69c1b5295f078514c155dc04 Mon Sep 17 00:00:00 2001 From: Faber Date: Thu, 27 Jun 2019 21:43:15 -0400 Subject: [PATCH 4/6] minor edits to extractor code --- cortexutils/extractor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cortexutils/extractor.py b/cortexutils/extractor.py index 492f15c..7bc5b1d 100644 --- a/cortexutils/extractor.py +++ b/cortexutils/extractor.py @@ -113,7 +113,7 @@ def __init_regex(self): regex.append({ 'type': 'hash', 'regex': re.compile(r'^([0-9a-fA-F]{32}|[0-9a-fA-F]{40}|[0-9a-fA-F]{64})$'), - 'ft_regex': re.compile(r'([0-9a-fA-F]{32}|[0-9a-fA-F]{40}|[0-9a-fA-F]{64})') + 'ft_regex': re.compile(r'([0-9a-fA-F]{32}|[0-9a-fA-F]{40}|[0-9a-fA-F]{64})[\s\>\ Date: Thu, 27 Jun 2019 21:43:51 -0400 Subject: [PATCH 5/6] unit tests to cover code changes --- tests/test_suite_extractor.py | 60 +++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/tests/test_suite_extractor.py b/tests/test_suite_extractor.py index bc54e4e..7d2eec0 100644 --- a/tests/test_suite_extractor.py +++ b/tests/test_suite_extractor.py @@ -97,12 +97,66 @@ def test_single_regkey(self): 'registry single string: wrong data type.' ) + def test_text_ip(self): + text = 'This is a string with an IP 8.8.8.8 embedded' + self.assertEqual( + self.extractor.extract_matches(value=text), + { + 'ip': ['8.8.8.8'] + }, + 'ip in text: failed.' + ) + + def test_text_url(self): + text = 'This is a string with a url http://www.somebaddomain.com/badness/bad embedded' + self.assertEqual( + self.extractor.extract_matches(value=text), + { + 'url': ['http://www.somebaddomain.com/badness/bad'], + 'domain': [u'somebaddomain.com'], + 'fqdn': [u'www.somebaddomain.com'] + }, + 'url in text: failed.' + ) + + def test_text_hash(self): + text = '''b373bd6b144e7846f45a1e47eed380b7 This is a string with an hashes b373bd6b144e7846f45a1e47ced380b8 and + 7ef8b3dc5bf40268f66721a89b95f4c5f0cc08e34836f8c3a007ceed193654d4 embedded + ''' + self.assertEqual( + self.extractor.extract_matches(value=text), + { + 'hash': [ + 'b373bd6b144e7846f45a1e47eed380b7', + 'b373bd6b144e7846f45a1e47ced380b8', + '7ef8b3dc5bf40268f66721a89b95f4c5f0cc08e34836f8c3a007ceed193654d4' + ] + }, + 'hash in text: failed.' + ) + + def test_text_email(self): + text = 'This is a string with a url myemail@gmail.com and joe.smith@somecorp.org embedded' + self.assertEqual( + self.extractor.extract_matches(value=text), + { + 'mail': [ + 'myemail@gmail.com', + 'joe.smith@somecorp.org' + ] + }, + 'email in text: failed.' + ) + def test_iterable(self): l_real = self.extractor.check_iterable({ 'results': [ { 'This is an totally unimportant key': '8.8.8.8' }, + { + 'This is an IP in text': 'This is a really bad IP 8.8.8.9 serving malware' + }, { 'Totally nested!': ['https://nestedurl.verynested.com'] } @@ -119,6 +173,10 @@ def test_iterable(self): 'dataType': 'ip', 'data': '8.8.8.8' }, + { + 'dataType': 'ip', + 'data': '8.8.8.9' + }, { 'dataType': 'url', 'data': 'https://nestedurl.verynested.com' @@ -140,8 +198,6 @@ def test_iterable(self): # Sorting the lists l_real = sorted(l_real, key=lambda k: k['data']) l_expected = sorted(l_expected, key=lambda k: k['data']) - print l_real - print l_expected self.assertEqual( l_real, From d7fd6dad2165f895aa8f66b4d6cd954875af5b0c Mon Sep 17 00:00:00 2001 From: Faber Date: Thu, 27 Jun 2019 21:50:07 -0400 Subject: [PATCH 6/6] added requirements file --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..54a1da2 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +ipaddress==1.0.22 +tld==0.9.3