From 3481974882a83fdb74b0c6cd5ccf6d2139191391 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mat=C4=9Bj=20Barnat?= <barnama1@cvut.cz>
Date: Fri, 22 Sep 2023 08:49:40 +0200
Subject: [PATCH 01/21] Threat categorization

---
 NERDd/blacklists.py             |  40 ++++++-
 NERDd/core/update_manager.py    |   7 +-
 NERDd/dshield.py                |  11 +-
 NERDd/misp_receiver.py          |  70 +++++++++--
 NERDd/otx_receiver.py           |  27 ++++-
 NERDd/warden_receiver.py        |  64 ++++++++--
 NERDweb/nerd_main.py            |  27 +++++
 NERDweb/static/ips.js           |   4 +
 NERDweb/static/style.css        |  13 +++
 NERDweb/templates/ips.html      |  26 ++++-
 common/threat_categorization.py |  45 +++++++
 etc/threat_categorization.yml   | 200 ++++++++++++++++++++++++++++++++
 12 files changed, 502 insertions(+), 32 deletions(-)
 create mode 100644 common/threat_categorization.py
 create mode 100644 etc/threat_categorization.yml

diff --git a/NERDd/blacklists.py b/NERDd/blacklists.py
index 0152acc3..1691b931 100755
--- a/NERDd/blacklists.py
+++ b/NERDd/blacklists.py
@@ -18,6 +18,7 @@
 # Add to path the "one directory above the current file location" to find modules from "common"
 sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')))
 
+from common.threat_categorization import load_categorization_config
 from common.utils import parse_rfc_time
 import common.config
 import common.task_queue
@@ -39,6 +40,39 @@
     'ip': {'singular': "IP", 'plural': "IPs"}
 }
 
+###############################################################################
+# Threat categorization
+
+categorization_config = None
+blacklist_to_category = None
+
+
+def categorization_init():
+    global categorization_config
+    global blacklist_to_category
+
+    categorization_config = load_categorization_config("blacklists")
+
+    blacklist_to_category = {}
+    for category_id, category_config in categorization_config.items():
+        for blacklist_id in category_config.get("blacklists", []):
+            blacklist_to_category[blacklist_id] = category_id
+
+
+def classify_blacklist(blacklist_id):
+    global categorization_config
+    global blacklist_to_category
+
+    if categorization_config is None:
+        categorization_init()
+
+    if blacklist_id in blacklist_to_category:
+        category_id = blacklist_to_category[blacklist_id]
+        ip_role = categorization_config[category_id]["role"]
+        return ip_role, category_id
+    else:
+        return "src", "unknown"
+
 
 ###############################################################################
 
@@ -159,11 +193,15 @@ def get_blacklist(id, name, url, regex, bl_type, life_length, params):
 
     log.info("{} IPs found in '{}', sending tasks to NERD workers".format(len(bl_records), id))
 
+    ip_role, ip_category = classify_blacklist(id)
+
     for ip in bl_records:
         task_queue_writer.put_task('ip', ip, [
             ('setmax', '_ttl.bl', now_plus_life_length),
             ('array_upsert', 'bl', {'n': id},
-                [('set', 'v', 1), ('set', 't', download_time), ('append', 'h', download_time)])
+                [('set', 'v', 1), ('set', 't', download_time), ('append', 'h', download_time)]),
+            ('array_upsert', 'threat_category', {'id': ip_category, 'role': ip_role},
+                [('add_to_set', 'blacklists', id)])
         ], "blacklists")
 
 
diff --git a/NERDd/core/update_manager.py b/NERDd/core/update_manager.py
index 9f52d7eb..025f9167 100644
--- a/NERDd/core/update_manager.py
+++ b/NERDd/core/update_manager.py
@@ -202,9 +202,10 @@ def perform_update(rec, updreq):
         updates_performed = []
         for action in actions:
             upds = perform_update(item, action) # recursion
-            # List of all actions must be returned, convert relative keys to absolute
-            for inner_key, new_val in upds:
-                updates_performed.append((key + '[' + str(i) + '].' + inner_key, new_val))
+            if upds is not None:
+                # List of all actions must be returned, convert relative keys to absolute
+                for inner_key, new_val in upds:
+                    updates_performed.append((key + '[' + str(i) + '].' + inner_key, new_val))
         return updates_performed
     
     elif op == 'array_remove':
diff --git a/NERDd/dshield.py b/NERDd/dshield.py
index ff37584a..2a6ef5f4 100644
--- a/NERDd/dshield.py
+++ b/NERDd/dshield.py
@@ -89,11 +89,12 @@ def process_feed(feed_data):
         if (ips[ip_addr]["reports"] < min_reports) or (ips[ip_addr]["targets"] < min_targets):
             continue
         tq_writer.put_task('ip', ip_addr, [
-                                            ('array_upsert', 'dshield', {'date' : date_str},
-                                             [('set', 'reports', ips[ip_addr]["reports"]),
-                                              ('set', 'targets', ips[ip_addr]["targets"])]),
-                                            ('setmax', '_ttl.dshield', ttl_date),
-                                          ], "dshield")
+            ('array_upsert', 'dshield', {'date': date_str},
+             [('set', 'reports', ips[ip_addr]["reports"]),
+              ('set', 'targets', ips[ip_addr]["targets"])]),
+            ('setmax', '_ttl.dshield', ttl_date),
+            ('array_upsert', 'threat_category', {'id': 'scan', 'role': 'src'}, [('add', 'n_reports.dshield', ips[ip_addr]["reports"])])
+        ], "dshield")
     logger.info("Tasks created")
 
 def download_feed():
diff --git a/NERDd/misp_receiver.py b/NERDd/misp_receiver.py
index cdcc46b7..63bc94f9 100644
--- a/NERDd/misp_receiver.py
+++ b/NERDd/misp_receiver.py
@@ -27,6 +27,7 @@
 from common.config import read_config
 from common.task_queue import TaskQueueWriter
 from common.utils import int2ipstr
+from common.threat_categorization import ClassifiableEvent, load_categorization_config
 
 running_flag = True
 zmq_alive = False
@@ -98,6 +99,33 @@
 IP_MISP_TYPES = ["ip-src", "ip-dst", "ip-dst|port", "ip-src|port", "domain|ip"]
 THREAT_LEVEL_DICT = {'1': "High", '2': "Medium", '3': "Low", '4': "Undefined"}
 
+##############################################################################
+# Threat categorization
+
+categorization_config = None
+
+def classify_ip(event_data, ip_role):
+    global categorization_config
+    if categorization_config is None:
+        categorization_config = load_categorization_config("misp_receiver")
+
+    output = []
+    event = ClassifiableEvent("misp_receiver", event_data, ip_role)
+    for category_id, category_config in categorization_config.items():
+        for statement in category_config["triggers"]:
+            if eval(statement) is True:
+                subcategories = {}  # TODO
+                if ip_role == "src and dst at the same time":
+                    output.append({'id': category_id, 'role': 'src', 'subcategories': subcategories})
+                    output.append({'id': category_id, 'role': 'dst', 'subcategories': subcategories})
+                else:
+                    output.append({'id': category_id, 'role': ip_role, 'subcategories': subcategories})
+    if not output:
+        output = [{'id': 'unknown', 'role': ip_role, 'subcategories': {}}]
+    return output
+
+##############################################################################
+# Main module code
 
 def is_single_ip(ip_to_check):
     try:
@@ -265,18 +293,38 @@ def upsert_new_event(event, attrib, sighting_list, role=None):
     :param role: role of ip_address (src or|and dst)
     :return: None
     """
-    new_event = create_new_event(event, role if role is not None else get_role_of_ip(attrib['type']), sighting_list)
     ip_addr = get_ip_address(attrib)
+    ip_role = role if role is not None else get_role_of_ip(attrib['type'])
+    new_event = create_new_event(event, ip_role, sighting_list)
+    live_till = new_event['date'] + timedelta(days=inactive_ip_lifetime)
+
     # create update sets for NERD queue
-    updates = []
+    event_updates = []
     for k, v in new_event.items():
-        updates.append(('set', k, v))
-    live_till = new_event['date'] + timedelta(days=inactive_ip_lifetime)
-    tq_writer.put_task('ip', ip_addr, [
-        ('array_upsert', 'misp_events', {'misp_instance': misp_url, 'event_id': event['id']}, updates),
+        event_updates.append(('set', k, v))
+    updates = [
+        ('array_upsert', 'misp_events', {'misp_instance': misp_url, 'event_id': event['id']}, event_updates),
         ('setmax', '_ttl.misp', live_till),
         ('setmax', 'last_activity', new_event['date'])
-    ], "misp_receiver")
+    ]
+
+    # threat categorization updates
+    for category_data in classify_ip(new_event, ip_role):
+        subcategory_updates = []
+        for subcategory, values in category_data['subcategories'].items():
+            subcategory_updates.append(('extend_set', subcategory, values))
+        updates.append((
+            'array_upsert',
+            'threat_category',
+            {'id': category_data['id'], 'role': category_data['role']},
+            [('add', 'n_reports.misp_receiver', 1), *subcategory_updates]
+        ))
+
+    logger.info(f"Updates for {ip_addr}:")
+    logger.info(updates)
+
+    # put task in queue
+    tq_writer.put_task('ip', ip_addr, updates, "misp_receiver")
 
 
 def process_sighting_notification(sighting):
@@ -287,9 +335,10 @@ def process_sighting_notification(sighting):
     """
     try:
         # event which attribute was sighted
-        event = misp_inst.get(sighting['event_id'])['Event']
+        event = misp_inst.get_event(sighting['event_id'])['Event']
         # get sightings of attribute (rather set actual values of all sightings, than just add or remove 1 sighting)
-        sighting_list_response = misp_inst.sighting_list(int(sighting['attribute_id']))['response']
+        attr_id = int(sighting['attribute_id'])
+        sighting_list_response = misp_inst.search_sightings(context='attribute', context_id=attr_id)
         sighting_list = []
         for sighting_rec in sighting_list_response:
             sighting_list.append({'type': sighting_rec['Sighting']['type']})
@@ -491,7 +540,8 @@ def receive_events():
         #                            'xxx': "yyy",
         #                            ...... },
         #                   'action': "log"}
-        notification_prefix, _, notification = message.partition(" ")
+        notification_prefix, _, notification_str = message.partition(" ")
+        notification = json.loads(notification_str)
 
         # check message prefix, which defines actions
         if notification_prefix == "misp_json_audit":
diff --git a/NERDd/otx_receiver.py b/NERDd/otx_receiver.py
index bc781a7e..dacff513 100644
--- a/NERDd/otx_receiver.py
+++ b/NERDd/otx_receiver.py
@@ -31,6 +31,12 @@
 
 from OTXv2 import OTXv2
 
+# Add to path the "one directory above the current file location" to find modules from "common"
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')))
+
+from common.threat_categorization import ClassifiableEvent, load_categorization_config
+
+
 def parse_datetime(time_str):
     # Parse ISO-formatted string with optional fractional part (datetime.fromisoformat would do it from Py>=3.7, but we still use Py3.6)
     if '.' in time_str:
@@ -94,6 +100,23 @@ def parse_datetime(time_str):
 
 scheduler = BlockingScheduler(timezone='UTC')
 
+# Threat categorization
+categorization_config = None
+
+
+def classify_ip(pulse):
+    global categorization_config
+    if categorization_config is None:
+        categorization_config = load_categorization_config("otx_receiver")
+
+    event = ClassifiableEvent("otx_receiver", pulse)
+
+    for category_id, category_config in categorization_config.items():
+        for statement in category_config["triggers"]:
+            if eval(statement) is True:
+                return category_config["role"], category_id
+    return "src", "unknown"
+
 
 def create_new_pulse(pulse, indicator):
     """
@@ -135,10 +158,12 @@ def upsert_new_pulse(pulse, indicator):
         live_till = current_time + timedelta(days=inactive_pulse_time)
     else:
         live_till = parse_datetime(indicator['expiration']) + timedelta(days=inactive_pulse_time)
+    ip_role, ip_category = classify_ip(pulse)
     tq_writer.put_task('ip', ip_addr, [
         ('array_upsert', 'otx_pulses', {'pulse_id': pulse['id']}, updates),
         ('setmax', '_ttl.otx', live_till),
-        ('setmax', 'last_activity', current_time)
+        ('setmax', 'last_activity', current_time),
+        ('array_upsert', 'threat_category', {'id': ip_category, 'role': ip_role}, [('add', 'n_reports.otx_receiver', 1)])
     ], "otx_receiver")
 
 
diff --git a/NERDd/warden_receiver.py b/NERDd/warden_receiver.py
index b17295f9..7dc36696 100644
--- a/NERDd/warden_receiver.py
+++ b/NERDd/warden_receiver.py
@@ -23,6 +23,7 @@
 # Add to path the "one directory above the current file location" to find modules from "common"
 sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')))
 
+from common.threat_categorization import ClassifiableEvent, load_categorization_config
 from common.utils import parse_rfc_time
 import common.config
 import common.eventdb_psql
@@ -396,6 +397,34 @@ def should_pass(self, idea_message):
             # if no rule matched, then do default action
             return self.default_action()
 
+##############################################################################
+# Threat categorization
+
+categorization_config = None
+
+
+def classify_ip(event_data, source_data):
+    global categorization_config
+    if categorization_config is None:
+        categorization_config = load_categorization_config("warden_receiver")
+
+    output = []
+    event = ClassifiableEvent("warden_receiver", event_data, source_data)
+
+    for category_id, category_config in categorization_config.items():
+        for statement in category_config["triggers"]:
+            if eval(statement) is True:
+                subcategories = {}
+                if "port" in category_config["subcategories"] and event.target_ports:
+                    subcategories["port"] = event.target_ports
+                if "protocol" in category_config["subcategories"] and event.protocols:
+                    subcategories["protocol"] = event.protocols
+                if "malware_family" in category_config["subcategories"]:
+                    pass  # TODO
+                output.append({"id": category_id, "role": category_config["role"], "subcategories": subcategories})
+    if not output:
+        output = [{"id": "unknown", "role": "src", "subcategories": {}}]
+    return output
 
 ##############################################################################
 # Main module code
@@ -476,17 +505,30 @@ def receive_events(filer_path, eventdb, task_queue_writer, inactive_ip_lifetime,
                     # calculate the timestamp, to which the record should be kept
                     live_till = end_time + life_span
 
-                    task_queue_writer.put_task('ip', ipv4,
-                        [
-                            ('array_upsert', 'events',
-                             {'date': date, 'node': node, 'cat': cat},
-                             [('add', 'n', 1)]),
-                            ('add', 'events_meta.total', 1),
-                            ('setmax', 'last_activity', end_time),
-                            ('setmax', '_ttl.warden', live_till),
-                        ],
-                        "warden_receiver"
-                    )
+                    updates = [
+                        ('array_upsert', 'events',
+                         {'date': date, 'node': node, 'cat': cat},
+                         [('add', 'n', 1)]),
+                        ('add', 'events_meta.total', 1),
+                        ('setmax', 'last_activity', end_time),
+                        ('setmax', '_ttl.warden', live_till),
+                    ]
+
+                    # threat categorization updates
+                    for category_data in classify_ip(event, src):
+                        subcategory_updates = []
+                        for subcategory, values in category_data['subcategories'].items():
+                            subcategory_updates.append(('extend_set', subcategory, values))
+                        updates.append((
+                            'array_upsert',
+                            'threat_category',
+                            {'id': category_data['id'], 'role': category_data['role']},
+                            [('add', 'n_reports.warden_receiver', 1), *subcategory_updates]
+                        ))
+
+                    # put task in queue
+                    task_queue_writer.put_task('ip', ipv4, updates, "warden_receiver")
+
                 for ipv6 in src.get("IP6", []):
                     log.debug(
                         "IPv6 address in Source found - skipping since IPv6 is not implemented yet.")  # The record follows:\n{}".format(str(event)), file=sys.stderr)
diff --git a/NERDweb/nerd_main.py b/NERDweb/nerd_main.py
index e9311ac6..9d9ca5bb 100644
--- a/NERDweb/nerd_main.py
+++ b/NERDweb/nerd_main.py
@@ -31,6 +31,7 @@
 sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')))
 import common.config
 import common.task_queue
+import common.threat_categorization
 from common.utils import ipstr2int, int2ipstr, parse_rfc_time
 from shodan_rpc_client import ShodanRpcClient
 
@@ -811,7 +812,12 @@ class IPFilterForm(FlaskForm):
     source = SelectMultipleField('Source', [validators.Optional()])
     source_op = HiddenField('', default="or")
     cat = SelectMultipleField('Event category', [validators.Optional()]) # Choices are set up dynamically (see below)
+    tc_role = SelectMultipleField('Role', [validators.Optional()])
+    tc_category = SelectMultipleField('Category', [validators.Optional()])
+    tc_subcategory = StringField('Subcategory', [validators.Optional()], filters=[strip_whitespace])
     cat_op = HiddenField('', default="or")
+    tc_role_op = HiddenField('', default="or")
+    tc_category_op = HiddenField('', default="or")
     node = SelectMultipleField('', [validators.Optional()])
     node_op = HiddenField('', default="or")
     blacklist = SelectMultipleField('Blacklist', [validators.Optional()])
@@ -859,6 +865,11 @@ def __init__(self, *args, **kwargs):
         cnt_by_source = {item["_id"]: item["n"] for item in mongo.db.n_ip_by_ttl.find()}
         self.source.choices = [(src_id, '{} ({})'.format(src_name, int(cnt_by_source.get(src_id, 0)))) for src_id,src_name in source_names.items()]
 
+        # Load categorization config to get list of all categories
+        threat_categories = common.threat_categorization.load_categorization_config()
+        self.tc_role.choices = [("src", "Source"), ("dst", "Destination")]
+        self.tc_category.choices = [(cat_id, cat_data['label']) for cat_id, cat_data in threat_categories.items()]
+
         # Number of occurrences for blacklists (list of blacklists is taken from configuration)
         bl_name2num = {item['_id']: int(item['n']) for item in mongo.db.n_ip_by_bl.find()}
         dbl_name2num = {item['_id']: int(item['n']) for item in mongo.db.n_ip_by_dbl.find()}
@@ -939,6 +950,22 @@ def create_query(form):
     if form.cat.data:
         op = '$and' if (form.cat_op.data == "and") else '$or'
         queries.append({op: [{'events.cat': cat} for cat in form.cat.data]})
+    if form.tc_role.data or form.tc_category.data or form.tc_subcategory.data:
+        elem_match = {}
+        if form.tc_role.data:
+            role_op = '$and' if (form.tc_role_op.data == "and") else '$or'
+            elem_match.update({role_op: [{"role": role} for role in form.tc_role.data]})
+        if form.tc_subcategory.data:
+            subcategory_id, subcategory_value = form.tc_subcategory.data.split("=")
+            if subcategory_id == "port":
+                subcategory_value = int(subcategory_value)
+            elem_match.update({subcategory_id: subcategory_value})
+        if form.tc_category.data:
+            cat_op = '$and' if (form.tc_category_op.data == "and") else '$or'
+            query = {cat_op: [{"threat_category": {"$elemMatch": {**elem_match, "id": cat}}} for cat in form.tc_category.data]}
+        else:
+            query = {"threat_category": {"$elemMatch": elem_match}}
+        queries.append(query)
     if form.node.data:
         op = '$and' if (form.node_op.data == "and") else '$or'
         queries.append({op: [{'events.node': node} for node in form.node.data]})
diff --git a/NERDweb/static/ips.js b/NERDweb/static/ips.js
index 7696a2d2..a9fc6f52 100644
--- a/NERDweb/static/ips.js
+++ b/NERDweb/static/ips.js
@@ -7,6 +7,8 @@ function set_up_search_form() {
   $("select#country").multiselect({texts: {placeholder: "Any"}, search: true, selectAll: true});
   $("select#source").multiselect({texts: {placeholder: "Any"}, search: true});
   $("select#cat").multiselect({texts: {placeholder: "Any"}, search: true});
+  $("select#tc_role").multiselect({texts: {placeholder: "Any"}, search: true});
+  $("select#tc_category").multiselect({texts: {placeholder: "Any"}, search: true});
   $("select#node").multiselect({texts: {placeholder: "Any"}, search: true});
   $("select#blacklist").multiselect({texts: {placeholder: "Any"}, search: true});
   $("select#tag").multiselect({texts: {placeholder: "Any"}, search: true});
@@ -106,6 +108,8 @@ function set_up_search_form() {
 
     set_up_op_button("#source_op_button", "#source_op", "OR: At least one of the selected categories", "AND: All selected categories")
     set_up_op_button("#cat_op_button", "#cat_op", "OR: At least one of the selected categories", "AND: All selected categories")
+    set_up_op_button("#tc_role_op_button", "#tc_role_op", "OR: At least one of the selected roles", "AND: All selected roles")
+    set_up_op_button("#tc_category_op_button", "#tc_category_op", "OR: At least one of the selected categories", "AND: All selected categories")
     set_up_op_button("#node_op_button", "#node_op", "OR: At least one of the selected nodes", "AND: All selected nodes")
     set_up_op_button("#bl_op_button", "#bl_op", "OR: At least one of the selected blacklists", "AND: All selected blacklists")
     set_up_op_button("#tag_op_button", "#tag_op", "OR: At least one of the selected tags", "AND: All selected tags")
diff --git a/NERDweb/static/style.css b/NERDweb/static/style.css
index c31d6861..2da5c459 100644
--- a/NERDweb/static/style.css
+++ b/NERDweb/static/style.css
@@ -1244,6 +1244,19 @@ ul.data-list li ul li {
     font-weight: bold;
   }
 
+  #threat_category
+  {
+    display: flex;
+    flex-direction: row;
+    width: 813px;
+    justify-content: space-between;
+  }
+
+  #threat_category p
+  {
+    font-weight: bold;
+  }
+
 
   #searchForm
   {
diff --git a/NERDweb/templates/ips.html b/NERDweb/templates/ips.html
index eb4408a2..87c8e23f 100644
--- a/NERDweb/templates/ips.html
+++ b/NERDweb/templates/ips.html
@@ -154,7 +154,31 @@ <h1>Search IP addresses by ...</h1>
      </div>
     </div>
       {% endif %}
-     
+
+      <div class="row narrow" id="narrow">
+        <div id="sorting">
+          <div>
+            <p>Threat category</p>
+          </div>
+          <div class="opt">
+            <span class="lab">
+              Role
+              <span id="tc_role_op_button" class="op_button"><div class="or selected"><span>OR</span></div> <div class="and"><span>AND</span></div></span></span>
+             {{ form.tc_role() }}{{ form.tc_role_op() }}
+          </div>
+          <div class="opt">
+            <span class="lab">
+              Category
+              <span id="tc_category_op_button" class="op_button"><div class="or selected"><span>OR</span></div> <div class="and"><span>AND</span></div></span></span>
+             {{ form.tc_category() }}{{ form.tc_category_op() }}
+          </div>
+          <div class="opt">
+            <span class="lab">Subcategory</span>
+            {{ formfield(form.tc_subcategory, size=10) }}
+          </div>
+        </div>
+      </div>
+
       <div id="sorting">
         <div>
           <p>Sorting options</p>
diff --git a/common/threat_categorization.py b/common/threat_categorization.py
new file mode 100644
index 00000000..e02929c5
--- /dev/null
+++ b/common/threat_categorization.py
@@ -0,0 +1,45 @@
+import yaml
+
+
+class ClassifiableEvent:
+    def __getattr__(self, name):
+        return self.__dict__[name] if name in self.__dict__ else None
+
+    def __init__(self, module_name=None, *args):
+        init_fn = getattr(self, f"init_{module_name}")
+        init_fn(*args)
+
+    def init_warden_receiver(self, event, source):
+        self.categories = event.get('Category', [])
+        self.source_types = source.get('Type', [])
+        self.description = event.get("Description", "")
+        target_ports = []
+        protocols = source.get('Proto', [])
+        for target in event.get('Target', []):
+            target_ports += target.get('Port', [])
+            protocols += target.get('Proto', [])
+        self.target_ports = list(set(target_ports))
+        self.protocols = list(set(protocols))
+
+    def init_otx_receiver(self, pulse):
+        self.indicator_role = pulse.get('indicator_role', "")
+        self.indicator_title = pulse.get('indicator_title', "")
+        self.n_reports = 1  # TODO
+
+    def init_misp_receiver(self, event, ip_role):
+        self.tags = [tag["name"] for tag in event.get('tag_list', [])]
+        self.ip_role = ip_role
+
+
+def load_categorization_config(module_name=None):
+    categories = {}
+    categorization_config = yaml.safe_load(open("/etc/nerd/threat_categorization.yml"))
+    for category_id, category_config in categorization_config.items():
+        categories[category_id] = {
+            "label": category_config.get("label", ""),
+            "role": category_config.get("role", "src"),
+            "subcategories": category_config.get("subcategories", []),
+            "triggers": category_config.get("triggers", {}).get(module_name, "False").split("\n"),
+            "blacklists": category_config.get("blacklists", [])
+        }
+    return categories
diff --git a/etc/threat_categorization.yml b/etc/threat_categorization.yml
new file mode 100644
index 00000000..471e5a70
--- /dev/null
+++ b/etc/threat_categorization.yml
@@ -0,0 +1,200 @@
+
+unknown:
+  role: src
+  description: The IP was reported as a source of malicious/unexpected/rogue packets, but without any further specification.
+  label: Unknown
+
+scan:
+  role: src
+  description: The IP address performs common network scanning, i.e. it tries to connect to various targets to search for open ports/services.
+  label: Scanning
+  subcategories:
+    - port
+  triggers:
+    warden_receiver: |-
+      'Recon.Scanning' in event.categories
+    otx_receiver: |-
+      event.indicator_role == 'scanning_host'
+    misp_receiver: |-
+      'CERT-XLM:information-gathering="scanner"' in event.tags
+      'ecsirt:information-gathering="scanner"' in event.tags
+      'circl:incident-classification="scan"' in event.tags
+
+bruteforce:
+  role: src
+  description: The IP performs dictionary (or bruteforce) attacks on password-protected services. Usually accompanied with scanning - searching for the targeted service.
+  label: Bruteforce
+  subcategories:
+    - protocol
+    - port
+  triggers:
+    warden_receiver: |-
+      'Attempt.Login' in event.categories
+    otx_receiver: |-
+      event.indicator_role == 'bruteforce'
+    misp_receiver: |-
+      'CERT-XLM:intrusion-attempts="login-attempts"' in event.tags
+      'ecsirt:intrusion-attempts="brute-force"' in event.tags
+  blacklists:
+    - bruteforceblocker
+    - blocklist_de-ssh
+    - blocklist_de-bruteforcelogin
+    - charles_the_haleys_ssh_dico_ips
+    - charles_the_haleys_smtp_dico_ips
+    - dataplane_org_sshclient
+    - dataplane_org_sshpwauth
+    - dataplane_org_telnet_login
+
+ddos:
+  role: src
+  description: The IP has been observed as a source of volumetric (D)DoS attacks.
+  label: DDoS
+  triggers:
+    warden_receiver: |-
+      'Availability.DoS' in event.categories
+      'Availability.DDoS' in event.categories
+    misp_receiver: |-
+      'DDoS' in event.tags
+      'CERT-XLM:availability="dos"' in event.tags
+      'CERT-XLM:availability="ddos"' in event.tags
+      'ecsirt:availability="dos"' in event.tags
+      'ecsirt:availability="ddos"' in event.tags
+      'circl:incident-classification="denial-of-service"' in event.tags
+
+ddos-amplifier:
+  role: dst
+  description: The IP runs a service which can be (and often is) misused as an amplifier for DDoS attacks, e.g. open DNS resolvers, NTP servers, memcached, etc.
+  label: DDoS amplifier
+  subcategories:
+    - protocol
+  triggers:
+    warden_receiver: |-
+      'Vulnerable.Config' in event.categories and 'dns' in event.protocols
+      'Vulnerable.Config' in event.categories and 'ntp' in event.protocols
+      'Vulnerable.Config' in event.categories and 'memcached' in event.protocols
+      'Backscatter' in event.source_types
+      'Open DNS Resolver' in event.description
+      'Open Memcached' in event.description
+      'Abusable NTP' in event.description
+
+spam:
+  role: src
+  description: The IP is sending spam.
+  label: Spam
+  triggers:
+    warden_receiver: |-
+      'Abusive.Spam' in event.categories
+      'OriginSpam' in event.source_types
+      'Spam' in event.source_types
+    misp_receiver: |-
+      'CERT-XLM:abusive-content="spam"' in event.tags
+      'ecsirt:abusive-content="spam"' in event.tags
+      'circl:incident-classification="spam"' in event.tags
+  blacklists:
+    - sblam_ips
+    - psbl
+    - spamhaus_edrop
+
+malware_distribution:
+  role: dst
+  description: The IP is used to distribute malware, e.g. it hosts an HTTP URL from which malware is being downloaded.
+  label: Malware distribution
+  subcategories:
+    - malware_family
+  triggers:
+    warden_receiver: |-
+      'Malware.Virus' in event.categories
+      'Malware.Worm' in event.categories
+      'Malware.Trojan' in event.categories
+      'Malware.Spyware' in event.categories
+      'Malware.Dialer' in event.categories
+      'Malware.Rootkit' in event.categories
+      'Malware' in event.source_types
+      'OriginMalware' in event.source_types
+      'OriginSandbox' in event.source_types
+    otx_receiver: |-
+      event.indicator_role == 'trojan'
+      event.indicator_role == 'malware_hosting'
+    misp_receiver: |-
+      'MALWARE' in event.tags
+      'keylogger/infostealer' in event.tags
+      'Keylogger' in event.tags
+      'infostealer' in event.tags
+      'Ransomware' in event.tags
+      'Remote Access Trojan' in event.tags
+      'MalSpam' in event.tags
+  blacklists:
+    - urlhouse_ips
+
+cc:
+  role: dst
+  description: The IP is used as Command&Control server for a botnet/malware.
+  label: Command and control
+  subcategories:
+    - malware_family
+  triggers:
+    warden_receiver: |-
+      'CC' in event.source_types
+    otx_receiver: |-
+      event.indicator_role == 'command_and_control'
+    misp_receiver: |-
+      'kill-chain:Command and Control' in event.tags
+      'ecsirt:malicious-code="c&c"' in event.tags
+  blacklists:
+    - feodo
+    - bambenek_c2
+
+botnet_drone:
+  role: src
+  description: The IP is acting as a bot/drone of a botnet.
+  label: Botnet drone
+  subcategories:
+    - malware_family
+  triggers:
+    warden_receiver: |-
+      'Intrusion.Botnet' in event.categories
+      'Botnet' in event.source_types
+    misp_receiver: |-
+      'CERT-XLM:intrusion="botnet-member"' in event.tags
+      'ecsirt:malicious-code="botnet-drone"' in event.tags
+  blacklists:
+    - mirai_tracker_ips
+
+phishing_site:
+  role: dst
+  description: The IP is hosting a phishing website.
+  label: Phishing site
+  triggers:
+    warden_receiver: |-
+      'Fraud.Phishing' in event.categories
+      'Phishing' in event.source_types
+    misp_receiver: |-
+      'Phishing' in event.tags
+      'Phishing Site' in event.tags
+      'CERT-XLM:fraud="phishing"' in event.tags
+      'ecsirt:fraud="phishing"' in event.tags
+      'circl:incident-classification="phishing"' in event.tags
+      'circl:incident-classification="whaling"' in event.tags
+      'circl:incident-classification="smishing"' in event.tags
+  blacklists:
+    - openphish
+
+exploit:
+  role: src
+  description: The IP is attempting to exploit known vulnerabilities.
+  label: Exploit
+  subcategories:
+    - protocol
+  triggers:
+    warden_receiver: |-
+      'Attempt.Exploit' in event.categories
+    otx_receiver: |-
+      event.indicator_role == 'exploit_source'
+      event.indicator_role == 'exploit_kit'
+    misp_receiver: |-
+      'Exploit Kit' in event.tags
+      'CERT-XLM:intrusion-attempts="exploit-known-vuln"' in event.tags
+      'CERT-XLM:intrusion-attempts="new-attack-signature"' in event.tags
+      'ecsirt:intrusion-attempts="exploit"' in event.tags
+      'circl:incident-classification="XSS"' in event.tags
+      'circl:incident-classification="sql-injection"' in event.tags

From 1844debfcdbdeb229caa0cbb38fdf3f221e6a5e4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mat=C4=9Bj=20Barnat?= <barnat@cesnet.cz>
Date: Sat, 4 Nov 2023 10:15:43 +0100
Subject: [PATCH 02/21] Threat categorization updates

---
 NERDd/blacklists.py                   |  37 +++++---
 NERDd/dshield.py                      |   2 +-
 NERDd/misp_receiver.py                |  47 ++++++----
 NERDd/otx_receiver.py                 |  58 ++++++++++---
 NERDd/warden_receiver.py              |  41 ++++++---
 NERDweb/nerd_main.py                  |  91 ++++++++++++++++----
 NERDweb/static/main.js                |  29 +++++++
 NERDweb/static/style.css              |  47 ++++++++++
 NERDweb/templates/ip.html             |  17 ++++
 NERDweb/templates/ips.html            |   5 ++
 common/threat_categorization.py       | 118 +++++++++++++++++++++++++-
 etc/threat_categorization.yml         |  92 ++++++++++++--------
 scripts/download_malpedia_families.py |  33 +++++++
 13 files changed, 510 insertions(+), 107 deletions(-)
 create mode 100644 scripts/download_malpedia_families.py

diff --git a/NERDd/blacklists.py b/NERDd/blacklists.py
index 1691b931..c771b554 100755
--- a/NERDd/blacklists.py
+++ b/NERDd/blacklists.py
@@ -18,7 +18,7 @@
 # Add to path the "one directory above the current file location" to find modules from "common"
 sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')))
 
-from common.threat_categorization import load_categorization_config
+from common.threat_categorization import *
 from common.utils import parse_rfc_time
 import common.config
 import common.task_queue
@@ -48,6 +48,11 @@
 
 
 def categorization_init():
+    """
+    Create a blacklist -> category mapping based on the categorization config in '/etc/threat_categorization.yml'
+
+    :return:
+    """
     global categorization_config
     global blacklist_to_category
 
@@ -55,11 +60,21 @@ def categorization_init():
 
     blacklist_to_category = {}
     for category_id, category_config in categorization_config.items():
-        for blacklist_id in category_config.get("blacklists", []):
-            blacklist_to_category[blacklist_id] = category_id
+        for line in category_config.get("triggers", []):
+            split_line = line.split("->", 1)  # trigger format: "<blacklist_id>" or "<blacklist_id> -> <subcategory dict literal>"
+            blacklist_id = split_line[0].strip()  # drop whitespace around "->" so classify_blacklist() finds the id
+            subcategories = {}
+            if len(split_line) > 1:
+                subcategories = ast.literal_eval(split_line[1].lstrip())
+            blacklist_to_category[blacklist_id] = {"id": category_id, "role": category_config["role"], "subcategories": subcategories}
 
 
 def classify_blacklist(blacklist_id):
+    """
+    Assign a threat category based on the blacklist -> category mapping created by categorization_init()
+
+    :return: Assigned category
+    """
     global categorization_config
     global blacklist_to_category
 
@@ -67,11 +82,9 @@ def classify_blacklist(blacklist_id):
         categorization_init()
 
     if blacklist_id in blacklist_to_category:
-        category_id = blacklist_to_category[blacklist_id]
-        ip_role = categorization_config[category_id]["role"]
-        return ip_role, category_id
+        return blacklist_to_category[blacklist_id]
     else:
-        return "src", "unknown"
+        return {"role": "src", "id": "unknown", "subcategories": {}}
 
 
 ###############################################################################
@@ -193,15 +206,19 @@ def get_blacklist(id, name, url, regex, bl_type, life_length, params):
 
     log.info("{} IPs found in '{}', sending tasks to NERD workers".format(len(bl_records), id))
 
-    ip_role, ip_category = classify_blacklist(id)
+    category = classify_blacklist(id)
+    subcategory_updates = []
+    for subcategory, values in category['subcategories'].items():
+        subcategory_updates.append(('extend_set', subcategory, values))
+    log_category(id, "blacklists", category, None)
 
     for ip in bl_records:
         task_queue_writer.put_task('ip', ip, [
             ('setmax', '_ttl.bl', now_plus_life_length),
             ('array_upsert', 'bl', {'n': id},
                 [('set', 'v', 1), ('set', 't', download_time), ('append', 'h', download_time)]),
-            ('array_upsert', 'threat_category', {'id': ip_category, 'role': ip_role},
-                [('add_to_set', 'blacklists', id)])
+            ('array_upsert', 'threat_category', {'id': category["id"], 'role': category["role"]},
+                [('add_to_set', 'n_reports.blacklists', id), *subcategory_updates])  # subcategory ops go inside the update list, as in the other receivers
         ], "blacklists")
 
 
diff --git a/NERDd/dshield.py b/NERDd/dshield.py
index 2a6ef5f4..6212f3eb 100644
--- a/NERDd/dshield.py
+++ b/NERDd/dshield.py
@@ -93,7 +93,7 @@ def process_feed(feed_data):
              [('set', 'reports', ips[ip_addr]["reports"]),
               ('set', 'targets', ips[ip_addr]["targets"])]),
             ('setmax', '_ttl.dshield', ttl_date),
-            ('array_upsert', 'threat_category', {'id': 'scan', 'role': 'src'}, [('add', 'n_reports.dshield', ips[ip_addr]["reports"])])
+            ('array_upsert', 'threat_category', {'id': 'scan', 'role': 'src'}, [('set', 'n_reports.dshield', ips[ip_addr]["reports"])])
         ], "dshield")
     logger.info("Tasks created")
 
diff --git a/NERDd/misp_receiver.py b/NERDd/misp_receiver.py
index 63bc94f9..5fc41072 100644
--- a/NERDd/misp_receiver.py
+++ b/NERDd/misp_receiver.py
@@ -27,7 +27,7 @@
 from common.config import read_config
 from common.task_queue import TaskQueueWriter
 from common.utils import int2ipstr
-from common.threat_categorization import ClassifiableEvent, load_categorization_config
+from common.threat_categorization import *
 
 running_flag = True
 zmq_alive = False
@@ -103,25 +103,42 @@
 # Threat categorization
 
 categorization_config = None
+malware_families = None
 
-def classify_ip(event_data, ip_role):
+def classify_ip(ip_addr, event_data, attrib, ip_role):
+    """
+    Assign a threat category based on the information provided in the incoming event
+
+    :return: List of assigned categories
+    """
     global categorization_config
+    global malware_families
     if categorization_config is None:
         categorization_config = load_categorization_config("misp_receiver")
+        malware_families = load_malware_families()
 
     output = []
-    event = ClassifiableEvent("misp_receiver", event_data, ip_role)
+    event = ClassifiableEvent("misp_receiver", event_data, attrib, ip_role)
     for category_id, category_config in categorization_config.items():
-        for statement in category_config["triggers"]:
-            if eval(statement) is True:
-                subcategories = {}  # TODO
-                if ip_role == "src and dst at the same time":
-                    output.append({'id': category_id, 'role': 'src', 'subcategories': subcategories})
-                    output.append({'id': category_id, 'role': 'dst', 'subcategories': subcategories})
-                else:
-                    output.append({'id': category_id, 'role': ip_role, 'subcategories': subcategories})
+        for trigger in category_config["triggers"]:
+            result, subcategories = eval_trigger(trigger, event)
+            if result is True:
+                if "port" in category_config["subcategories"]:
+                    ports_from_config = subcategories.get("port", [])
+                    if event.target_ports or ports_from_config:
+                        subcategories["port"] = list(set((event.target_ports or []) + ports_from_config))  # target_ports is None when the MISP attribute carried no port
+                if "malware_family" in category_config["subcategories"]:
+                    for family_id, family_data in malware_families.items():
+                        if match_str(family_data["common_name"], event.attrib_comment) or \
+                           match_str(family_data["common_name"], event.info):
+                            if "malware_family" not in subcategories:
+                                subcategories["malware_family"] = [family_id]
+                            else:
+                                subcategories["malware_family"].append(family_id)
+                output.append({"id": category_id, "role": category_config["role"], "subcategories": subcategories})
     if not output:
-        output = [{'id': 'unknown', 'role': ip_role, 'subcategories': {}}]
+        output.append({"id": "unknown", "role": "src", "subcategories": {}})
+    log_category(ip_addr, "misp_receiver", output, event)
     return output
 
 ##############################################################################
@@ -309,7 +326,7 @@ def upsert_new_event(event, attrib, sighting_list, role=None):
     ]
 
     # threat categorization updates
-    for category_data in classify_ip(new_event, ip_role):
+    for category_data in classify_ip(ip_addr, new_event, attrib, ip_role):
         subcategory_updates = []
         for subcategory, values in category_data['subcategories'].items():
             subcategory_updates.append(('extend_set', subcategory, values))
@@ -320,8 +337,8 @@ def upsert_new_event(event, attrib, sighting_list, role=None):
             [('add', 'n_reports.misp_receiver', 1), *subcategory_updates]
         ))
 
-    logger.info(f"Updates for {ip_addr}:")
-    logger.info(updates)
+    logger.debug(f"Updates for {ip_addr}:")
+    logger.debug(updates)
 
     # put task in queue
     tq_writer.put_task('ip', ip_addr, updates, "misp_receiver")
diff --git a/NERDd/otx_receiver.py b/NERDd/otx_receiver.py
index dacff513..c6e257cd 100644
--- a/NERDd/otx_receiver.py
+++ b/NERDd/otx_receiver.py
@@ -34,8 +34,7 @@
 # Add to path the "one directory above the current file location" to find modules from "common"
 sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')))
 
-from common.threat_categorization import ClassifiableEvent, load_categorization_config
-
+from common.threat_categorization import *
 
 def parse_datetime(time_str):
     # Parse ISO-formatted string with optional fractional part (datetime.fromisoformat would do it from Py>=3.7, but we still use Py3.6)
@@ -102,20 +101,41 @@ def parse_datetime(time_str):
 
 # Threat categorization
 categorization_config = None
+malware_families = None
+
 
+def classify_ip(ip_addr, pulse):
+    """
+    Assign a threat category based on the information provided in the incoming event
 
-def classify_ip(pulse):
+    :return: List of assigned categories
+    """
     global categorization_config
+    global malware_families
     if categorization_config is None:
         categorization_config = load_categorization_config("otx_receiver")
+        malware_families = load_malware_families()
 
+    output = []
     event = ClassifiableEvent("otx_receiver", pulse)
 
     for category_id, category_config in categorization_config.items():
-        for statement in category_config["triggers"]:
-            if eval(statement) is True:
-                return category_config["role"], category_id
-    return "src", "unknown"
+        for trigger in category_config["triggers"]:
+            result, subcategories = eval_trigger(trigger, event)
+            if result is True:
+                if "malware_family" in category_config["subcategories"]:
+                    for family_id, family_data in malware_families.items():
+                        if match_str(family_data["common_name"], event.indicator_title) or \
+                                match_str(family_data["common_name"], event.pulse_name):
+                            if "malware_family" not in subcategories:
+                                subcategories["malware_family"] = [family_id]
+                            else:
+                                subcategories["malware_family"].append(family_id)
+                output.append({"id": category_id, "role": category_config["role"], "subcategories": subcategories})
+    if not output:
+        output.append({"id": "unknown", "role": "src", "subcategories": {}})
+    log_category(ip_addr, "otx_receiver", output, event)
+    return output
 
 
 def create_new_pulse(pulse, indicator):
@@ -158,13 +178,27 @@ def upsert_new_pulse(pulse, indicator):
         live_till = current_time + timedelta(days=inactive_pulse_time)
     else:
         live_till = parse_datetime(indicator['expiration']) + timedelta(days=inactive_pulse_time)
-    ip_role, ip_category = classify_ip(pulse)
-    tq_writer.put_task('ip', ip_addr, [
+
+    updates = [
         ('array_upsert', 'otx_pulses', {'pulse_id': pulse['id']}, updates),
         ('setmax', '_ttl.otx', live_till),
-        ('setmax', 'last_activity', current_time),
-        ('array_upsert', 'threat_category', {'id': ip_category, 'role': ip_role}, [('add', 'n_reports.otx_receiver', 1)])
-    ], "otx_receiver")
+        ('setmax', 'last_activity', current_time)
+    ]
+
+    # threat categorization updates
+    for category_data in classify_ip(ip_addr, new_pulse):
+        subcategory_updates = []
+        for subcategory, values in category_data['subcategories'].items():
+            subcategory_updates.append(('extend_set', subcategory, values))
+        updates.append((
+            'array_upsert',
+            'threat_category',
+            {'id': category_data['id'], 'role': category_data['role']},
+            [('add', 'n_reports.otx_receiver', 1), *subcategory_updates]
+        ))
+
+    # put task in queue
+    tq_writer.put_task('ip', ip_addr, updates, "otx_receiver")
 
 
 def write_time(current_time):
diff --git a/NERDd/warden_receiver.py b/NERDd/warden_receiver.py
index 7dc36696..6e8c6c8f 100644
--- a/NERDd/warden_receiver.py
+++ b/NERDd/warden_receiver.py
@@ -23,7 +23,7 @@
 # Add to path the "one directory above the current file location" to find modules from "common"
 sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')))
 
-from common.threat_categorization import ClassifiableEvent, load_categorization_config
+from common.threat_categorization import *
 from common.utils import parse_rfc_time
 import common.config
 import common.eventdb_psql
@@ -401,29 +401,46 @@ def should_pass(self, idea_message):
 # Threat categorization
 
 categorization_config = None
+malware_families = None
 
+def classify_ip(ip_addr, event_data, source_data):
+    """
+    Assign a threat category based on the information provided in the incoming event
 
-def classify_ip(event_data, source_data):
+    :return: List of assigned categories
+    """
     global categorization_config
+    global malware_families
     if categorization_config is None:
         categorization_config = load_categorization_config("warden_receiver")
+        malware_families = load_malware_families()
 
     output = []
     event = ClassifiableEvent("warden_receiver", event_data, source_data)
 
     for category_id, category_config in categorization_config.items():
-        for statement in category_config["triggers"]:
-            if eval(statement) is True:
-                subcategories = {}
-                if "port" in category_config["subcategories"] and event.target_ports:
-                    subcategories["port"] = event.target_ports
-                if "protocol" in category_config["subcategories"] and event.protocols:
-                    subcategories["protocol"] = event.protocols
+        for trigger in category_config["triggers"]:
+            result, subcategories = eval_trigger(trigger, event)
+            if result is True:
+                if "port" in category_config["subcategories"]:
+                    ports_from_config = subcategories.get("port", [])
+                    if event.target_ports or ports_from_config:
+                        subcategories["port"] = list(set(event.target_ports + ports_from_config))
+                if "protocol" in category_config["subcategories"]:
+                    protocols_from_config = subcategories.get("protocol", [])
+                    if event.protocols or protocols_from_config:
+                        subcategories["protocol"] = list(set(event.protocols + protocols_from_config))
                 if "malware_family" in category_config["subcategories"]:
-                    pass  # TODO
+                    for family_id, family_data in malware_families.items():
+                        if match_str(family_data["common_name"], event.description):
+                            if "malware_family" not in subcategories:
+                                subcategories["malware_family"] = [family_id.lower()]
+                            else:
+                                subcategories["malware_family"].append(family_id.lower())
                 output.append({"id": category_id, "role": category_config["role"], "subcategories": subcategories})
     if not output:
-        output = [{"id": "unknown", "role": "src", "subcategories": {}}]
+        output.append({"id": "unknown", "role": "src", "subcategories": {}})
+    log_category(ip_addr, "warden_receiver", output, event)
     return output
 
 ##############################################################################
@@ -515,7 +532,7 @@ def receive_events(filer_path, eventdb, task_queue_writer, inactive_ip_lifetime,
                     ]
 
                     # threat categorization updates
-                    for category_data in classify_ip(event, src):
+                    for category_data in classify_ip(ipv4, event, src):
                         subcategory_updates = []
                         for subcategory, values in category_data['subcategories'].items():
                             subcategory_updates.append(('extend_set', subcategory, values))
diff --git a/NERDweb/nerd_main.py b/NERDweb/nerd_main.py
index 9d9ca5bb..a8a9098f 100644
--- a/NERDweb/nerd_main.py
+++ b/NERDweb/nerd_main.py
@@ -70,6 +70,9 @@
 p_bl_config = common.config.read_config(p_bl_cfg_file)
 dnsbl_config = common.config.read_config(dnsbl_cfg_file)
 
+# Threat categorization config
+threat_categorization_config = common.threat_categorization.load_categorization_config()
+
 # Dict: blacklist_id -> parameters
 #  parameters should contain:
 #    all: id, name, descr, feed_type
@@ -333,6 +336,11 @@ def strip_whitespace(s):
     return s
 
 
+def to_lower(s):
+    """Form filter: return a lowercased copy of a string; pass any other value through unchanged."""
+    # non-str input is returned untouched rather than converted
+    return s.lower() if isinstance(s, str) else s
+
 # ***** Auxiliary functions *****
 
 def pseudonymize_node_name(name):
@@ -814,7 +822,10 @@ class IPFilterForm(FlaskForm):
     cat = SelectMultipleField('Event category', [validators.Optional()]) # Choices are set up dynamically (see below)
     tc_role = SelectMultipleField('Role', [validators.Optional()])
     tc_category = SelectMultipleField('Category', [validators.Optional()])
-    tc_subcategory = StringField('Subcategory', [validators.Optional()], filters=[strip_whitespace])
+    tc_subcategory = StringField('Subcategory', [
+        validators.Optional(),
+        validators.Regexp(r'^(\w+)=(\w+)$', re.IGNORECASE)  # expects "<subcategory>=<value>"; raw string because '\w' is an invalid escape in a plain literal
+    ], filters=[strip_whitespace, to_lower])
     cat_op = HiddenField('', default="or")
     tc_role_op = HiddenField('', default="or")
     tc_category_op = HiddenField('', default="or")
@@ -866,9 +877,8 @@ def __init__(self, *args, **kwargs):
         self.source.choices = [(src_id, '{} ({})'.format(src_name, int(cnt_by_source.get(src_id, 0)))) for src_id,src_name in source_names.items()]
 
         # Load categorization config to get list of all categories
-        threat_categories = common.threat_categorization.load_categorization_config()
         self.tc_role.choices = [("src", "Source"), ("dst", "Destination")]
-        self.tc_category.choices = [(cat_id, cat_data['label']) for cat_id, cat_data in threat_categories.items()]
+        self.tc_category.choices = [(cat_id, cat_data['label']) for cat_id, cat_data in threat_categorization_config.items()]
 
         # Number of occurrences for blacklists (list of blacklists is taken from configuration)
         bl_name2num = {item['_id']: int(item['n']) for item in mongo.db.n_ip_by_bl.find()}
@@ -951,21 +961,26 @@ def create_query(form):
         op = '$and' if (form.cat_op.data == "and") else '$or'
         queries.append({op: [{'events.cat': cat} for cat in form.cat.data]})
     if form.tc_role.data or form.tc_category.data or form.tc_subcategory.data:
-        elem_match = {}
-        if form.tc_role.data:
-            role_op = '$and' if (form.tc_role_op.data == "and") else '$or'
-            elem_match.update({role_op: [{"role": role} for role in form.tc_role.data]})
-        if form.tc_subcategory.data:
-            subcategory_id, subcategory_value = form.tc_subcategory.data.split("=")
-            if subcategory_id == "port":
-                subcategory_value = int(subcategory_value)
-            elem_match.update({subcategory_id: subcategory_value})
-        if form.tc_category.data:
-            cat_op = '$and' if (form.tc_category_op.data == "and") else '$or'
-            query = {cat_op: [{"threat_category": {"$elemMatch": {**elem_match, "id": cat}}} for cat in form.tc_category.data]}
+        if form.tc_role.data and len(form.tc_role.data) > 1 and form.tc_role_op.data == "and":
+            for role in form.tc_role.data:
+                queries.append({"threat_category": {"$elemMatch": {"role": role}}})
         else:
-            query = {"threat_category": {"$elemMatch": elem_match}}
-        queries.append(query)
+            query = {}
+            elem_match = {}
+            if form.tc_role.data:
+                role_op = '$and' if (form.tc_role_op.data == "and") else '$or'
+                elem_match.update({role_op: [{"role": role} for role in form.tc_role.data]})
+            if form.tc_subcategory.data:
+                subcategory_id, subcategory_value = form.tc_subcategory.data.split("=")
+                if subcategory_id == "port" and subcategory_value.isdecimal():  # ports are compared as ints; a non-numeric value stays a string instead of raising ValueError
+                    subcategory_value = int(subcategory_value)
+                elem_match.update({subcategory_id: subcategory_value})
+            if form.tc_category.data:
+                cat_op = '$and' if (form.tc_category_op.data == "and") else '$or'
+                query = {cat_op: [{"threat_category": {"$elemMatch": {**elem_match, "id": cat}}} for cat in form.tc_category.data]}
+            else:
+                query = {"threat_category": {"$elemMatch": elem_match}}
+            queries.append(query)
     if form.node.data:
         op = '$and' if (form.node_op.data == "and") else '$or'
         queries.append({op: [{'events.node': node} for node in form.node.data]})
@@ -1124,6 +1139,37 @@ def ips():
                     showable_misp_events += 1
             ip['_showable_misp_events'] = showable_misp_events
 
+            # Add info about threat category (a missing or empty list falls back to a single "unknown" entry)
+            categories = ip.get("threat_category") or [{"id": "unknown", "role": "src"}]
+            records = []
+            table_rows = []
+            total_reports = {}
+            for category in categories:
+                for module, n_reports in category.get("n_reports", {}).items():
+                    if module not in total_reports:
+                        total_reports[module] = 0
+                    if isinstance(n_reports, list):
+                        n_reports = len(n_reports)
+                    total_reports[module] += n_reports
+            for category in categories:
+                a = []
+                for module, n_reports in category.get("n_reports", {}).items():
+                    if isinstance(n_reports, list):
+                        n_reports = len(n_reports)
+                    # share of this module's reports that fall into this category (0.0 when the module total is 0)
+                    a.append(n_reports / total_reports[module] if total_reports[module] else 0.0)
+                if category["id"] == "unknown" or len(a) == 0:
+                    confidence = 0.0
+                else:
+                    confidence = sum(a) / len(a)
+                records.append([category["role"], category["id"], round(confidence, 2)])
+            records_sorted = sorted(records, key=lambda rec: rec[2], reverse=True)
+            for record in records_sorted:
+                record[2] = str(record[2])
+                table_rows.append(",".join(record))
+            ip["_threat_category_table_preview"] = ";".join(table_rows)
+            ip["_threat_category_role"] = records_sorted[0][0]
+            ip["_threat_category_id"] = records_sorted[0][1]
     else:
         results = None
         if g.user and not g.ac('ipsearch'):
@@ -1313,6 +1359,17 @@ def ip(ipaddr=None):
                             asn_list.append(asn)
                 ipinfo['asns'] = asn_list
 
+                threat_category_table = []
+                if 'threat_category' in ipinfo:
+                    for category in ipinfo['threat_category']:
+                        category_config = threat_categorization_config.get(category["id"], {})
+                        subcategories = category_config.get("subcategories", [])
+                        subcategory_content = f"{subcategories[0]}: {category[subcategories[0]]}" if subcategories and subcategories[0] in category else ""
+                        threat_category_table.append([category["role"], category["id"], subcategory_content])
+                        for subcategory in subcategories[1:]:
+                            if subcategory in category:
+                                threat_category_table.append(["", "", f"{subcategory}: {category[subcategory]}"])
+
                 # Pseudonymize node names if user is not allowed to see the original names
                 if not g.ac('nodenames'):
                     for evtrec in ipinfo.get('events', []):
diff --git a/NERDweb/static/main.js b/NERDweb/static/main.js
index e9d6124f..4f52b374 100644
--- a/NERDweb/static/main.js
+++ b/NERDweb/static/main.js
@@ -26,6 +26,25 @@ function create_event_table(data) { /* data are "dataset" field of a DOM node wi
   return content;
 }
 
+function create_threat_category_table_preview(data) { /* data is the "dataset" of a DOM node; data.tctablepreview holds "role,category,confidence" rows joined by ";" */
+  if (data.tctablepreview == "") {
+    return "No categories";
+  }
+  var table = [];
+  var table_rows = data.tctablepreview.split(";");
+  for (var i = 0; i < table_rows.length; i++) { /* "var" keeps the counter from leaking into the global scope */
+    table.push(table_rows[i].split(","));
+  }
+  var content = "<table><tr><th>Role</th><th>Category</th><th>Confidence</th></tr>";
+  for (var i = 0; i < table.length; i++) {
+    content += "<tr><td>";
+    content += table[i].join("</td><td>");
+    content += "</td></tr>";
+  }
+  content += "</table>";
+  return content;
+}
+
 $(function() {
   /* jQuery UI tooltip at:
      - country flags (with name of the country)
@@ -53,6 +72,16 @@ $(function() {
     content: function() { return create_event_table(this.dataset) }, /*$(".tooltip_event_table", this).html(); },*/
     tooltipClass: "events_tooltip"
   });
+  /* jQuery UI tooltip at "threat category" cell with confidence table */
+  $( ".threat_category" ).tooltip({
+    items: ".threat_category",
+    track: false,
+    show: false,
+    hide: false,
+    position: {my: "left bottom", at: "left-7px top-2px", collision: "flipfit"},
+    content: function() { return create_threat_category_table_preview(this.dataset) },
+    tooltipClass: "threat_category_tooltip"
+  });
   /* jQuery UI tooltip at times with "timeago" */
   $( ".time" ).tooltip({
     items: ".time",
diff --git a/NERDweb/static/style.css b/NERDweb/static/style.css
index 2da5c459..c7a3eeb7 100644
--- a/NERDweb/static/style.css
+++ b/NERDweb/static/style.css
@@ -484,6 +484,53 @@ td.country a {
   color: inherit;
 }
 
+td.threat_category {
+  text-align: left;
+}
+td.threat_category span {
+  display: inline-block;
+  min-width: 1.3em;
+  padding: 0 0.2em;
+}
+td.threat_category span + span {
+  color: #222;
+  border-left: 2px dotted #888;
+  padding: 0 0.5em;
+}
+
+.threat_category_tooltip table {
+  border: solid #000;
+  border-width: 0px 0px 1px 1px;
+  border-collapse: separate;
+  border-spacing: 0px;
+  margin: 0.4em 0;
+}
+.threat_category_tooltip table td,
+.threat_category_tooltip table th {
+  border: solid #000;
+  border-width: 1px 1px 0px 0px;
+  padding: 0.1em 0.4em;
+}
+.threat_category_tooltip table td {
+  text-align: left;
+}
+.threat_category_tooltip table th {
+  text-align: center;
+}
+
+.threat_category_table table td,
+.threat_category_table table th {
+  border: solid #000;
+  border-width: 1px 1px 1px 1px;
+  padding: 0.1em 0.4em;
+}
+.threat_category_table table td {
+  text-align: left;
+}
+.threat_category_table table th {
+  text-align: center;
+}
+
 td.events {
   text-align: right;
   padding-right: 0;
diff --git a/NERDweb/templates/ip.html b/NERDweb/templates/ip.html
index 341e1c54..2d18039c 100644
--- a/NERDweb/templates/ip.html
+++ b/NERDweb/templates/ip.html
@@ -282,6 +282,23 @@ <h1>IP address</h1>
 {% endfor %}
 </dl>
 
+{# Threat category table #}
+{% if threat_category_table %}
+<p class="caption">Threat category</p>
+<div class="threat_category_table">
+<table>
+    <tr><th>Role</th><th>Category</th><th>Subcategory</th></tr>
+    {% for row in threat_category_table %}
+        <tr>
+        {% for col in row %}
+            <td>{{ col }}</td>
+        {% endfor %}
+        </tr>
+    {% endfor %}
+</table>
+</div>
+{% endif %}
+
 {# Event plot WARDEN -#}
 <p class="caption">Warden event timeline</p>
 <div class="chart-container" style="position: relative; width: 100%; height: 20em">
diff --git a/NERDweb/templates/ips.html b/NERDweb/templates/ips.html
index 87c8e23f..37ecd5d6 100644
--- a/NERDweb/templates/ips.html
+++ b/NERDweb/templates/ips.html
@@ -289,6 +289,7 @@ <h1>Search IP addresses by ...</h1>
   <th title="Reputation score (first experimental algorithm - takes into account number of events and number of detectors per day, from last 14 days with linearly decreasing weight by age)">Rep.<sup><a href="https://github.com/CESNET/NERD/wiki/Reputation-score" title="More information about reputation score" target="_blank">(?)</a></sup></th>
   {% if ac('fmp') %}<th title="Future Maliciousness Probability score">FMP</th>{% endif %}
   <th>Other properties</th>
+  <th>Threat category</th>
   <th>Time added</th>
   <th>Last activity</th>
   <th title="Links to external services">Links</th>
@@ -406,6 +407,10 @@ <h1>Search IP addresses by ...</h1>
       {% endfor %}
     {% endif %}
   </td>
+  <td class="threat_category" data-tctablepreview="{{ip._threat_category_table_preview}}">
+    <span>{{ip._threat_category_role}}</span>
+    <span>{{ip._threat_category_id}}</span>
+  </td>
   <td class="time" {% if ip.ts_added %}data-time={{ ip.ts_added|date_to_int }}{% endif %}>{{ip.ts_added.strftime("%Y-%m-%d %H:%M:%S") if ip.ts_added else "--"}}</td>
   <td {% if ip.last_activity %}class="time" data-time={{ ip.last_activity|date_to_int }}{% endif %}>{{ip.last_activity.strftime("%Y-%m-%d %H:%M:%S") if ip.last_activity else "--"}}</td>
   <td class="links">
diff --git a/common/threat_categorization.py b/common/threat_categorization.py
index e02929c5..a19d00ef 100644
--- a/common/threat_categorization.py
+++ b/common/threat_categorization.py
@@ -1,15 +1,41 @@
 import yaml
+import ast
+from datetime import datetime
 
 
 class ClassifiableEvent:
     def __getattr__(self, name):
+        """
+        Override __getattr__ so that no error is raised when a module asks for a non-existing attribute
+        :param name: Name of the attribute
+        :return: Value of the attribute (or None if it does not exist)
+        """
         return self.__dict__[name] if name in self.__dict__ else None
 
+    def __str__(self):
+        """
+        Override __str__ for easier logging of assigned categories
+        (takes no parameters; every attribute stored on the instance is included)
+        :return: String representation of the object's attribute dictionary
+        """
+        return str(self.__dict__)
+
     def __init__(self, module_name=None, *args):
+        """
+        Initialize the event (fill metadata from source module)
+        :param module_name: Name of the source module; selects the init_<module_name> method that is called
+        :param *args: Arguments passed on to that init method (e.g. event and source for warden_receiver)
+        :return:
+        """
         init_fn = getattr(self, f"init_{module_name}")
         init_fn(*args)
 
     def init_warden_receiver(self, event, source):
+        """
+        Fill in metadata from a warden event
+        :param event: Source event
+        :param source: Record whose 'Type' list is stored as source_types
+        """
         self.categories = event.get('Category', [])
         self.source_types = source.get('Type', [])
         self.description = event.get("Description", "")
@@ -22,16 +48,101 @@ def init_warden_receiver(self, event, source):
         self.protocols = list(set(protocols))
 
     def init_otx_receiver(self, pulse):
+        """
+        Fill in metadata from an OTX pulse
+        :param pulse: Source pulse
+        :return:
+        """
         self.indicator_role = pulse.get('indicator_role', "")
         self.indicator_title = pulse.get('indicator_title', "")
-        self.n_reports = 1  # TODO
+        self.pulse_name = pulse.get('pulse_name', "")
 
-    def init_misp_receiver(self, event, ip_role):
+    def init_misp_receiver(self, event, attrib, ip_role):
+        """
+        Fill in metadata from a MISP event
+        :param event: Source event
+        :param attrib: Attribute with the source IP
+        :param ip_role: Role of the IP address (src/dst/both)
+        :return:
+        """
         self.tags = [tag["name"] for tag in event.get('tag_list', [])]
+        self.info = event.get('info', "")
+        self.attrib_comment = attrib.get('comment', "")
         self.ip_role = ip_role
+        try:
+            if attrib['type'] == "ip-dst|port":
+                split_attrib = attrib['value'].split('|')
+                if len(split_attrib) == 1:
+                    split_attrib = attrib['value'].split(':')
+                if len(split_attrib) > 1:
+                    self.target_ports = [int(split_attrib[1])]
+        except ValueError:
+            pass
+
+
+def eval_trigger(trigger, event):
+    """
+    Evaluate a category trigger, i.e. a statement that resolves to either True or False
+    :param trigger: Trigger to be evaluated ("<python expression>" or "<python expression> -> <dict literal>")
+    :param event: Source event (instance of ClassifiableEvent) from which the trigger reads data
+    :return: Result of the evaluation (True/False), dictionary with subcategory assignments
+    """
+    result = False
+    subcategories = {}
+    a = trigger.split("->")
+    if eval(a[0]) is True:  # SECURITY: eval() runs arbitrary code; triggers must only come from a trusted config file
+        result = True
+    if len(a) > 1:
+        subcategories = ast.literal_eval(a[1].lstrip())  # parsed even when the expression above is not True
+    return result, subcategories
+
+
+def match_str(str_a, str_b):
+    """
+    Approximate (sub)string matching
+
+    Ignores character casing, whitespace and some special characters
+    """
+    simplified_a = str_a.strip().replace("_", "").replace(".", "").lower()
+    simplified_b = str_b.strip().replace("_", "").replace(".", "").lower()
+    return simplified_a in simplified_b
+
+
+def log_category(id, module, category, event):
+    """
+    Log assigned category
+    :param id: ID of the record (e.g. IP address or blacklist name)
+    :param module: Name of the source module
+    :param category: Assigned category
+    :param event: Source event (instance of ClassifiableEvent)
+    :return:
+    """
+    with open(f"/var/log/nerd/threat_categorization_{module}.log", "a+") as logfile:
+        logfile.write(f"{datetime.now()}\n")
+        logfile.write(f"ID: {id}\n")
+        logfile.write(f"Category: {category}\n")
+        logfile.write(f"Event: {event}\n")
+        logfile.write("===============================================\n")
+
+
+def load_malware_families():
+    """
+    Load the list of malware families downloaded from Malpedia
+    :return:
+    """
+    try:
+        with open("/data/malpedia/malware_families.yml", "r") as f:
+            return yaml.safe_load(f)
+    except Exception:
+        return {}
 
 
 def load_categorization_config(module_name=None):
+    """
+    Load categorization configuration for a specific module
+    :param module_name: Name of the source module
+    :return: Dictionary containing categorization config
+    """
     categories = {}
     categorization_config = yaml.safe_load(open("/etc/nerd/threat_categorization.yml"))
     for category_id, category_config in categorization_config.items():
@@ -39,7 +150,6 @@ def load_categorization_config(module_name=None):
             "label": category_config.get("label", ""),
             "role": category_config.get("role", "src"),
             "subcategories": category_config.get("subcategories", []),
-            "triggers": category_config.get("triggers", {}).get(module_name, "False").split("\n"),
-            "blacklists": category_config.get("blacklists", [])
+            "triggers": category_config.get("triggers", {}).get(module_name, "False").split("\n")
         }
     return categories
diff --git a/etc/threat_categorization.yml b/etc/threat_categorization.yml
index 471e5a70..700e2380 100644
--- a/etc/threat_categorization.yml
+++ b/etc/threat_categorization.yml
@@ -15,6 +15,7 @@ scan:
       'Recon.Scanning' in event.categories
     otx_receiver: |-
       event.indicator_role == 'scanning_host'
+      event.pulse_name == 'Webscanners 2018-02-09 thru current day'
     misp_receiver: |-
       'CERT-XLM:information-gathering="scanner"' in event.tags
       'ecsirt:information-gathering="scanner"' in event.tags
@@ -30,20 +31,31 @@ bruteforce:
   triggers:
     warden_receiver: |-
       'Attempt.Login' in event.categories
+      'Intrusion.UserCompromise' in event.categories
+      'SSH login' in event.description -> {'protocol': ['ssh']}
     otx_receiver: |-
       event.indicator_role == 'bruteforce'
+      'Telnet Login attempt' in event.indicator_title -> {'protocol': ['telnet']}
+      'Telnet honeypot logs' in event.pulse_name -> {'protocol': ['telnet']}
+      'SSH honeypot logs' in event.pulse_name -> {'protocol': ['ssh']}
+      'RDP honeypot logs' in event.pulse_name -> {'protocol': ['rdp']}
+      'VNC honeypot logs' in event.pulse_name
+      'Redis honeypot logs' in event.pulse_name
+      'PostgresQL honeypot logs' in event.pulse_name
+      'SSH intrusion attempt' in event.indicator_title -> {'protocol': ['ssh']}
+      'RDP intrusion attempt' in event.indicator_title -> {'protocol': ['rdp']}
     misp_receiver: |-
       'CERT-XLM:intrusion-attempts="login-attempts"' in event.tags
       'ecsirt:intrusion-attempts="brute-force"' in event.tags
-  blacklists:
-    - bruteforceblocker
-    - blocklist_de-ssh
-    - blocklist_de-bruteforcelogin
-    - charles_the_haleys_ssh_dico_ips
-    - charles_the_haleys_smtp_dico_ips
-    - dataplane_org_sshclient
-    - dataplane_org_sshpwauth
-    - dataplane_org_telnet_login
+    blacklists: |-
+      bruteforceblocker
+      blocklist_de-ssh -> {'protocol': ['ssh']}
+      blocklist_de-bruteforcelogin
+      charles_the_haleys_ssh_dico_ips -> {'protocol': ['ssh']}
+      charles_the_haleys_smtp_dico_ips -> {'protocol': ['smtp']}
+      dataplane_org_sshclient -> {'protocol': ['ssh']}
+      dataplane_org_sshpwauth -> {'protocol': ['ssh']}
+      dataplane_org_telnet_login -> {'protocol': ['telnet']}
 
 ddos:
   role: src
@@ -51,6 +63,7 @@ ddos:
   label: DDoS
   triggers:
     warden_receiver: |-
+      'DoS anomalies' in event.description
       'Availability.DoS' in event.categories
       'Availability.DDoS' in event.categories
     misp_receiver: |-
@@ -73,9 +86,9 @@ ddos-amplifier:
       'Vulnerable.Config' in event.categories and 'ntp' in event.protocols
       'Vulnerable.Config' in event.categories and 'memcached' in event.protocols
       'Backscatter' in event.source_types
-      'Open DNS Resolver' in event.description
-      'Open Memcached' in event.description
-      'Abusable NTP' in event.description
+      'Open DNS Resolver' in event.description -> {'protocol': ['dns']}
+      'Open Memcached' in event.description -> {'protocol': ['memcached']}
+      'Abusable NTP' in event.description -> {'protocol': ['ntp']}
 
 spam:
   role: src
@@ -90,10 +103,10 @@ spam:
       'CERT-XLM:abusive-content="spam"' in event.tags
       'ecsirt:abusive-content="spam"' in event.tags
       'circl:incident-classification="spam"' in event.tags
-  blacklists:
-    - sblam_ips
-    - psbl
-    - spamhaus_edrop
+    blacklists: |-
+      sblam_ips
+      psbl
+      spamhaus_edrop
 
 malware_distribution:
   role: dst
@@ -103,28 +116,31 @@ malware_distribution:
     - malware_family
   triggers:
     warden_receiver: |-
+      'Malware' in event.source_types
+      'Malware' in event.categories
       'Malware.Virus' in event.categories
       'Malware.Worm' in event.categories
       'Malware.Trojan' in event.categories
       'Malware.Spyware' in event.categories
       'Malware.Dialer' in event.categories
       'Malware.Rootkit' in event.categories
-      'Malware' in event.source_types
       'OriginMalware' in event.source_types
-      'OriginSandbox' in event.source_types
     otx_receiver: |-
       event.indicator_role == 'trojan'
       event.indicator_role == 'malware_hosting'
     misp_receiver: |-
-      'MALWARE' in event.tags
-      'keylogger/infostealer' in event.tags
-      'Keylogger' in event.tags
-      'infostealer' in event.tags
-      'Ransomware' in event.tags
-      'Remote Access Trojan' in event.tags
-      'MalSpam' in event.tags
-  blacklists:
-    - urlhouse_ips
+      'Malware download' in event.attrib_comment
+      'payload_delivery' in event.attrib_comment
+      'MALWARE' in event.tags and 'dst' in event.ip_role
+      'Keylogger' in event.tags and 'dst' in event.ip_role
+      'infostealer' in event.tags and 'dst' in event.ip_role
+      'Ransomware' in event.tags and 'dst' in event.ip_role
+      'Remote Access Trojan' in event.tags and 'dst' in event.ip_role
+      'MalSpam' in event.tags and 'dst' in event.ip_role
+      'circl:incident-classification="malware"' in event.tags and 'dst' in event.ip_role
+      'ecsirt:malicious-code="malware"' in event.tags and 'dst' in event.ip_role
+    blacklists: |-
+      urlhouse_ips
 
 cc:
   role: dst
@@ -137,12 +153,15 @@ cc:
       'CC' in event.source_types
     otx_receiver: |-
       event.indicator_role == 'command_and_control'
+      'Command and Control' in event.indicator_title
     misp_receiver: |-
-      'kill-chain:Command and Control' in event.tags
-      'ecsirt:malicious-code="c&c"' in event.tags
-  blacklists:
-    - feodo
-    - bambenek_c2
+      'botnet_cc' in event.attrib_comment
+      'C2 server' in event.attrib_comment
+      'kill-chain:Command and Control' in event.tags and 'dst' in event.ip_role
+      'ecsirt:malicious-code="c&c"' in event.tags and 'dst' in event.ip_role
+    blacklists: |-
+      feodo
+      bambenek_c2
 
 botnet_drone:
   role: src
@@ -157,8 +176,8 @@ botnet_drone:
     misp_receiver: |-
       'CERT-XLM:intrusion="botnet-member"' in event.tags
       'ecsirt:malicious-code="botnet-drone"' in event.tags
-  blacklists:
-    - mirai_tracker_ips
+    blacklists: |-
+      mirai_tracker_ips -> {'malware_family': ['elf.mirai']}
 
 phishing_site:
   role: dst
@@ -176,8 +195,8 @@ phishing_site:
       'circl:incident-classification="phishing"' in event.tags
       'circl:incident-classification="whaling"' in event.tags
       'circl:incident-classification="smishing"' in event.tags
-  blacklists:
-    - openphish
+    blacklists: |-
+      openphish
 
 exploit:
   role: src
@@ -189,6 +208,7 @@ exploit:
     warden_receiver: |-
       'Attempt.Exploit' in event.categories
     otx_receiver: |-
+      'Apache honeypot logs' in event.pulse_name -> {'protocol': ['http']}
       event.indicator_role == 'exploit_source'
       event.indicator_role == 'exploit_kit'
     misp_receiver: |-
diff --git a/scripts/download_malpedia_families.py b/scripts/download_malpedia_families.py
new file mode 100644
index 00000000..d00301c6
--- /dev/null
+++ b/scripts/download_malpedia_families.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python3
+
+# Download Malpedia's list of malware families to /data/malpedia/malware_families.yml
+
+import requests
+import yaml
+import json
+import os
+
+url = "https://malpedia.caad.fkie.fraunhofer.de/api/get/families"
+response = requests.get(url, timeout=60)  # requests has no default timeout; do not hang forever
+response.raise_for_status()  # abort on HTTP errors instead of parsing an error page
+data = json.loads(response.content)
+output = {}
+output_dir = "/data/malpedia/"
+
+for family_id, family_data in data.items():
+    name = family_data.get("common_name", "")
+    if name == "":
+        try:
+            name = family_id.split('.')[1]  # e.g. "elf.mirai" -> "mirai"
+        except IndexError:  # family ID without a '.' separator
+            name = family_id
+    output[family_id] = {
+        "common_name": name,
+        "description": family_data.get("description", ""),
+        "url": f"https://malpedia.caad.fkie.fraunhofer.de/details/{family_id}"
+    }
+
+os.makedirs(output_dir, exist_ok=True)  # no check-then-create race
+
+with open(f"{output_dir}/malware_families.yml", "w+") as outfile:
+    yaml.dump(output, outfile, default_flow_style=False)

From 3e86cb074a5641daf82e768893985903573b7daa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mat=C4=9Bj=20Barnat?= <barnat@cesnet.cz>
Date: Wed, 29 Nov 2023 10:59:02 +0100
Subject: [PATCH 03/21] Threat categorization improvements:  - Source modules
 add date to category records (+cleaning of old data)  - New summary module  -
 Web updates  - Updated configuration  - Various bug fixes

---
 NERDd/blacklists.py                      |  42 ++-
 NERDd/dshield.py                         |   2 +-
 NERDd/misp_receiver.py                   |  72 ++--
 NERDd/modules/cleaner.py                 |  31 ++
 NERDd/modules/threat_category_summary.py | 108 ++++++
 NERDd/otx_receiver.py                    |  53 +--
 NERDd/warden_receiver.py                 |  65 +---
 NERDd/worker.py                          |   4 +-
 NERDweb/nerd_main.py                     |  85 ++---
 NERDweb/static/main.js                   |  28 +-
 NERDweb/static/style.css                 |  38 +-
 NERDweb/templates/ip.html                |  22 +-
 NERDweb/templates/ips.html               |  17 +-
 common/threat_categorization.py          | 149 ++++----
 etc/threat_categorization.yml            | 456 ++++++++++++-----------
 scripts/misp_updater.py                  |  56 ++-
 16 files changed, 683 insertions(+), 545 deletions(-)
 create mode 100644 NERDd/modules/threat_category_summary.py

diff --git a/NERDd/blacklists.py b/NERDd/blacklists.py
index c771b554..bbc71fde 100755
--- a/NERDd/blacklists.py
+++ b/NERDd/blacklists.py
@@ -43,24 +43,21 @@
 ###############################################################################
 # Threat categorization
 
-categorization_config = None
 blacklist_to_category = None
 
 
-def categorization_init():
+def categorization_init(categorization_config):
     """
-    Create a blacklist -> category mapping based on the categorization config in '/etc/threat_categorization.yml'
+    Create a blacklist -> category mapping based on the categorization config in 'etc/threat_categorization.yml'
 
     :return:
     """
-    global categorization_config
     global blacklist_to_category
 
-    categorization_config = load_categorization_config("blacklists")
-
     blacklist_to_category = {}
-    for category_id, category_config in categorization_config.items():
-        for line in category_config.get("triggers", []):
+    for category_id, category_config in categorization_config["categories"].items():
+        category_triggers = category_config.get("triggers", {}).get("blacklists", "").split("\n")
+        for line in category_triggers:
             split_line = line.split("->")
             blacklist_id = split_line[0]
             subcategories = {}
@@ -69,17 +66,16 @@ def categorization_init():
             blacklist_to_category[blacklist_id] = {"id": category_id, "role": category_config["role"], "subcategories": subcategories}
 
 
-def classify_blacklist(blacklist_id):
+def classify_blacklist(blacklist_id, categorization_config):
     """
     Assign a threat category based on the blacklist -> category mapping created by categorization_init()
 
     :return: Assigned category
     """
-    global categorization_config
     global blacklist_to_category
 
-    if categorization_config is None:
-        categorization_init()
+    if blacklist_to_category is None:
+        categorization_init(categorization_config)
 
     if blacklist_id in blacklist_to_category:
         return blacklist_to_category[blacklist_id]
@@ -184,7 +180,7 @@ def download_blacklist(blacklist_url, params=None):
         return ""
 
 
-def get_blacklist(id, name, url, regex, bl_type, life_length, params):
+def get_blacklist(id, name, url, regex, bl_type, life_length, params, categorization_config):
     """
     Download the blacklist, parse all its records, and create worker task for each IP in current blacklist.
     :param id: id of the blacklist
@@ -206,19 +202,18 @@ def get_blacklist(id, name, url, regex, bl_type, life_length, params):
 
     log.info("{} IPs found in '{}', sending tasks to NERD workers".format(len(bl_records), id))
 
-    category = classify_blacklist(id)
+    category = classify_blacklist(id, categorization_config)
     subcategory_updates = []
     for subcategory, values in category['subcategories'].items():
         subcategory_updates.append(('extend_set', subcategory, values))
-    log_category(id, "blacklists", category, None)
 
     for ip in bl_records:
         task_queue_writer.put_task('ip', ip, [
             ('setmax', '_ttl.bl', now_plus_life_length),
             ('array_upsert', 'bl', {'n': id},
                 [('set', 'v', 1), ('set', 't', download_time), ('append', 'h', download_time)]),
-            ('array_upsert', 'threat_category', {'id': category["id"], 'role': category["role"]},
-                [('add_to_set', 'n_reports.blacklists', id)], *subcategory_updates)
+            ('array_upsert', '_threat_category', {'date': download_time.strftime("%Y-%m-%d"), 'id': category["id"], 'role': category["role"]},
+                [('add', 'n_reports.blacklists', 1)], *subcategory_updates)
         ], "blacklists")
 
 
@@ -271,6 +266,15 @@ def stop(signal, frame):
     log.info("Loading config file {}".format(common_cfg_file))
     config.update(common.config.read_config(common_cfg_file))
 
+    # Read categorization config
+    categorization_cfg_file = os.path.join(config_base_path, 'threat_categorization.yml')
+    log.info("Loading config file {}".format(categorization_cfg_file))
+    config.update(common.config.read_config(categorization_cfg_file))
+    categorization_config = {
+        "categories": config.get('threat_categorization'),
+        "malware_families": common.config.read_config(config.get('malpedia_family_list_path'))
+    }
+
     rabbit_config = config.get("rabbitmq")
 
     # Get number of processes from config
@@ -304,7 +308,7 @@ def stop(signal, frame):
             assert isinstance(other_params, dict), "The additional parameter must be a dict (in config of {}.{})".format(
                 config_path, id)
             # Process the blacklist
-            get_blacklist(id, name, url, regex, bl_type, life_length, other_params)
+            get_blacklist(id, name, url, regex, bl_type, life_length, other_params, categorization_config)
 
     # Schedule periodic processing...
     if not args.one_shot:
@@ -321,7 +325,7 @@ def stop(signal, frame):
             other_params = bl.get('params', {})
 
             trigger = CronTrigger(**refresh_time)
-            job = scheduler.add_job(get_blacklist, args=(id, name, url, regex, bl_type, life_length, other_params),
+            job = scheduler.add_job(get_blacklist, args=(id, name, url, regex, bl_type, life_length, other_params, categorization_config),
                                     trigger=trigger, coalesce=True, max_instances=1)
 
             log.info("{} blacklist '{}' scheduled to be downloaded at every: {}".format(
diff --git a/NERDd/dshield.py b/NERDd/dshield.py
index 6212f3eb..251c0e61 100644
--- a/NERDd/dshield.py
+++ b/NERDd/dshield.py
@@ -93,7 +93,7 @@ def process_feed(feed_data):
              [('set', 'reports', ips[ip_addr]["reports"]),
               ('set', 'targets', ips[ip_addr]["targets"])]),
             ('setmax', '_ttl.dshield', ttl_date),
-            ('array_upsert', 'threat_category', {'id': 'scan', 'role': 'src'}, [('set', 'n_reports.dshield', ips[ip_addr]["reports"])])
+            ('array_upsert', '_threat_category', {'date': date_str, 'id': 'scan', 'role': 'src'}, [('set', 'n_reports.dshield', ips[ip_addr]["reports"])])
         ], "dshield")
     logger.info("Tasks created")
 
diff --git a/NERDd/misp_receiver.py b/NERDd/misp_receiver.py
index 5fc41072..00156c5f 100644
--- a/NERDd/misp_receiver.py
+++ b/NERDd/misp_receiver.py
@@ -62,6 +62,15 @@
 logger.info("Loading config file {}".format(common_cfg_file))
 config.update(read_config(common_cfg_file))
 
+# Read categorization config
+categorization_cfg_file = os.path.join(config_base_path, 'threat_categorization.yml')
+logger.info("Loading config file {}".format(categorization_cfg_file))
+config.update(read_config(categorization_cfg_file))
+categorization_config = {
+    "categories": config.get('threat_categorization'),
+    "malware_families": read_config(config.get('malpedia_family_list_path'))
+}
+
 inactive_ip_lifetime = config.get('record_life_length.misp', 180)
 
 rabbit_config = config.get("rabbitmq")
@@ -99,47 +108,6 @@
 IP_MISP_TYPES = ["ip-src", "ip-dst", "ip-dst|port", "ip-src|port", "domain|ip"]
 THREAT_LEVEL_DICT = {'1': "High", '2': "Medium", '3': "Low", '4': "Undefined"}
 
-##############################################################################
-# Threat categorization
-
-categorization_config = None
-malware_families = None
-
-def classify_ip(ip_addr, event_data, attrib, ip_role):
-    """
-    Assign a threat category based on the information provided in the incoming event
-
-    :return: List of assigned categories
-    """
-    global categorization_config
-    global malware_families
-    if categorization_config is None:
-        categorization_config = load_categorization_config("misp_receiver")
-        malware_families = load_malware_families()
-
-    output = []
-    event = ClassifiableEvent("misp_receiver", event_data, attrib, ip_role)
-    for category_id, category_config in categorization_config.items():
-        for trigger in category_config["triggers"]:
-            result, subcategories = eval_trigger(trigger, event)
-            if result is True:
-                if "port" in category_config["subcategories"]:
-                    ports_from_config = subcategories.get("port", [])
-                    if event.target_ports or ports_from_config:
-                        subcategories["port"] = list(set(event.target_ports + ports_from_config))
-                if "malware_family" in category_config["subcategories"]:
-                    for family_id, family_data in malware_families.items():
-                        if match_str(family_data["common_name"], event.attrib_comment) or \
-                           match_str(family_data["common_name"], event.info):
-                            if "malware_family" not in subcategories:
-                                subcategories["malware_family"] = [family_id]
-                            else:
-                                subcategories["malware_family"].append(family_id)
-                output.append({"id": category_id, "role": category_config["role"], "subcategories": subcategories})
-    if not output:
-        output.append({"id": "unknown", "role": "src", "subcategories": {}})
-    log_category(ip_addr, "misp_receiver", output, event)
-    return output
 
 ##############################################################################
 # Main module code
@@ -326,14 +294,14 @@ def upsert_new_event(event, attrib, sighting_list, role=None):
     ]
 
     # threat categorization updates
-    for category_data in classify_ip(ip_addr, new_event, attrib, ip_role):
+    for category_data in classify_ip(ip_addr, "misp_receiver", logger, categorization_config, new_event, attrib, ip_role):
         subcategory_updates = []
         for subcategory, values in category_data['subcategories'].items():
             subcategory_updates.append(('extend_set', subcategory, values))
         updates.append((
             'array_upsert',
-            'threat_category',
-            {'id': category_data['id'], 'role': category_data['role']},
+            '_threat_category',
+            {'date': category_data['date'], 'id': category_data['id'], 'role': category_data['role']},
             [('add', 'n_reports.misp_receiver', 1), *subcategory_updates]
         ))
 
@@ -475,9 +443,21 @@ def process_edit_of_attribute(json_message):
 def process_new_attribute(json_message):
     # change looks like: "to_ids () => (1), distribution () => (5), type () => (hostname)..."
     attrib = json_message['Log']['change']
-    attrib_type = re_attrib_type_change.search(attrib).group(1)
+    try:
+        attrib_type = re_attrib_type_change.search(attrib).group(1)
+    except AttributeError:
+        logger.error("Error", exc_info=True)
+        logger.error("Used regex: " + re_attrib_type_change.pattern)
+        logger.error("Searched text: " + attrib)
+        return
     if attrib_type in IP_MISP_TYPES:
-        event_id = re_event_id_change.search(json_message['Log']['change']).group(1)
+        try:
+            event_id = re_event_id_change.search(attrib).group(1)
+        except AttributeError:
+            logger.error("Error", exc_info=True)
+            logger.error("Used regex: " + re_event_id_change.pattern)
+            logger.error("Searched text: " + attrib)
+            return
         attrib_id = json_message['Log']['model_id']
         try:
             attrib_value = re_attrib_type_value_title.search(json_message['Log']['title']).group(2)
diff --git a/NERDd/modules/cleaner.py b/NERDd/modules/cleaner.py
index 2b1122f7..5422605c 100644
--- a/NERDd/modules/cleaner.py
+++ b/NERDd/modules/cleaner.py
@@ -47,6 +47,13 @@ def __init__(self):
             ('!every1d',),
             tuple() # No key is changed; some are removed, but there's no way to specify list of keys to delete in advance; anyway it shouldn't be a problem in this case.
         )
+        g.um.register_handler(
+            self.clear_threat_category,
+            'ip',
+            ('!every1d',),
+            tuple()
+            # No key is changed; some are removed, but there's no way to specify list of keys to delete in advance; anyway it shouldn't be a problem in this case.
+        )
         g.um.register_handler(
             self.check_ip_expiration,
             'ip',
@@ -170,6 +177,30 @@ def clear_otx_pulses(self, ekey, rec, updates):
 
         return actions
 
+    def clear_threat_category(self, ekey, rec, updates):
+        """
+        Handler function to clear old threat category data.
+        Remove all items under '_threat_category' with "date" older than current
+        day minus 'max_event_history' days.
+        """
+        etype, key = ekey
+        if etype != 'ip':
+            return None
+
+        today = datetime.utcnow().date()
+        cut_day = (today - self.max_event_history).strftime("%Y-%m-%d")
+
+        # Remove all threat category records with day before cut_day
+        actions = []
+        for category_record in rec.get('_threat_category', []):
+            if category_record['date'] < cut_day:  # Thanks to ISO format it's OK to compare dates as strings
+                actions.append(('array_remove', '_threat_category', {'date': category_record['date'], 'id': category_record['id'], 'role': category_record['role']}))
+
+        if actions:
+            self.log.debug("Cleaning {}: Removing {} old threat category records".format(key, len(actions)))  # every action is a removal
+
+        return actions
+
     def check_ip_expiration(self, ekey, rec, updates):
         """
         Check record's TTL tokens, and either issue normal !every1d or delete the record.
diff --git a/NERDd/modules/threat_category_summary.py b/NERDd/modules/threat_category_summary.py
new file mode 100644
index 00000000..9166536b
--- /dev/null
+++ b/NERDd/modules/threat_category_summary.py
@@ -0,0 +1,108 @@
+"""
+NERD module summarizing threat category records.
+
+Should be triggered at least once a day for every address.
+"""
+
+from core.basemodule import NERDModule
+import g
+
+from copy import deepcopy
+import datetime
+
+
+def nonlin(val, coef=0.5, max=20):
+    """Nonlinear transformation of [0,inf) to [0,1)"""
+    if val > max:
+        return 1.0
+    else:
+        return (1 - coef**val)
+
+
+class ThreatCategorySummary(NERDModule):
+    """
+    Module summarizing threat category records.
+    """
+
+    def __init__(self):
+        g.um.register_handler(
+            self.create_summary,  # function (or bound method) to call
+            'ip',  # entity type
+            ('_threat_category', '!every1d'),  # tuple/list/set of attributes to watch (their update triggers call of the registered method)
+            ('_threat_category_summary',)  # tuple/list/set of attributes the method may change
+        )
+
+    def create_summary(self, ekey, rec, updates):
+        """
+        Summarize threat category records - group records by category
+                                          - get total number of reports for each source module
+                                          - compute confidence
+
+        Category confidence (based on reputation score method):
+        - take list of records from last 14 days
+        - compute a "daily confidence" for each day as:
+          - nonlin(num_of_reports) * nonlin(number_of_sources)
+          - where nonlin is a nonlinear transformation: 1 - 1/2^x
+        - get total confidence as weighted average of all "daily" ones with
+          linearly decreasing weight (weight = (14-n)/14 for n=0..13)
+        """
+        etype, key = ekey
+        if etype != 'ip':
+            return None
+
+        if '_threat_category' not in rec:
+            return None # No threat category records, nothing to do
+
+        grouped_by_category = {}
+        for record in rec['_threat_category']:
+            cat = record['id']
+            if cat not in grouped_by_category:
+                grouped_by_category[cat] = []
+            grouped_by_category[cat].append(record)
+
+        today = datetime.datetime.utcnow().date()
+        DATE_RANGE = 14
+        summary = []
+
+        for cat, records in grouped_by_category.items():
+            cat_summary = {
+                'role': records[0]['role'],
+                'id': records[0]['id'],
+                'sources': {},
+                'subcategories': {}
+            }
+            sources = {}
+            subcategories = {}
+            sum_weight = 0
+            confidence = 0
+            for record in deepcopy(records):
+                date = record['date']
+                date = datetime.date(int(date[0:4]), int(date[5:7]), int(date[8:10]))
+                record_age_days = (today - date).days
+                if record_age_days >= DATE_RANGE:
+                    continue
+                daily_reports = 0
+                for source in record['n_reports']:
+                    if source not in sources:
+                        sources[source] = 0
+                    sources[source] += record['n_reports'][source]
+                    daily_reports += record['n_reports'][source]
+                daily_confidence = nonlin(daily_reports) * nonlin(len(record['n_reports']))
+                weight = float(DATE_RANGE - record_age_days) / DATE_RANGE
+                sum_weight += weight
+                confidence += daily_confidence * weight
+                del record['date']
+                del record['role']
+                del record['id']
+                del record['n_reports']
+                for key, values in record.items():
+                    if key not in subcategories:
+                        subcategories[key] = set()
+                    subcategories[key].update(values)
+            if confidence > 0:
+                cat_summary['confidence'] = round(confidence / sum_weight, 2)
+                cat_summary['sources'] = sources
+                cat_summary['subcategories'] = {k: list(v) for k, v in subcategories.items()}
+                summary.append(cat_summary)
+        summary = sorted(summary, key=lambda rec: rec['confidence'], reverse=True)
+        return [('set', '_threat_category_summary', summary)]
diff --git a/NERDd/otx_receiver.py b/NERDd/otx_receiver.py
index c6e257cd..ad1e6ac6 100644
--- a/NERDd/otx_receiver.py
+++ b/NERDd/otx_receiver.py
@@ -80,6 +80,15 @@ def parse_datetime(time_str):
 logger.info("Loading config file {}".format(common_cfg_file))
 config.update(read_config(common_cfg_file))
 
+# Read categorization config
+categorization_cfg_file = os.path.join(config_base_path, 'threat_categorization.yml')
+logger.info("Loading config file {}".format(categorization_cfg_file))
+config.update(read_config(categorization_cfg_file))
+categorization_config = {
+    "categories": config.get('threat_categorization'),
+    "malware_families": read_config(config.get('malpedia_family_list_path'))
+}
+
 inactive_pulse_time = config.get('record_life_length.otx', 30)
 
 otx_api_key = config.get('otx_api_key', None)
@@ -99,44 +108,6 @@ def parse_datetime(time_str):
 
 scheduler = BlockingScheduler(timezone='UTC')
 
-# Threat categorization
-categorization_config = None
-malware_families = None
-
-
-def classify_ip(ip_addr, pulse):
-    """
-    Assign a threat category based on the information provided in the incoming event
-
-    :return: List of assigned categories
-    """
-    global categorization_config
-    global malware_families
-    if categorization_config is None:
-        categorization_config = load_categorization_config("otx_receiver")
-        malware_families = load_malware_families()
-
-    output = []
-    event = ClassifiableEvent("otx_receiver", pulse)
-
-    for category_id, category_config in categorization_config.items():
-        for trigger in category_config["triggers"]:
-            result, subcategories = eval_trigger(trigger, event)
-            if result is True:
-                if "malware_family" in category_config["subcategories"]:
-                    for family_id, family_data in malware_families.items():
-                        if match_str(family_data["common_name"], event.indicator_title) or \
-                                match_str(family_data["common_name"], event.pulse_name):
-                            if "malware_family" not in subcategories:
-                                subcategories["malware_family"] = [family_id]
-                            else:
-                                subcategories["malware_family"].append(family_id)
-                output.append({"id": category_id, "role": category_config["role"], "subcategories": subcategories})
-    if not output:
-        output.append({"id": "unknown", "role": "src", "subcategories": {}})
-    log_category(ip_addr, "otx_receiver", output, event)
-    return output
-
 
 def create_new_pulse(pulse, indicator):
     """
@@ -186,14 +157,14 @@ def upsert_new_pulse(pulse, indicator):
     ]
 
     # threat categorization updates
-    for category_data in classify_ip(ip_addr, new_pulse):
+    for category_data in classify_ip(ip_addr, "otx_receiver", logger, categorization_config, new_pulse):
         subcategory_updates = []
         for subcategory, values in category_data['subcategories'].items():
             subcategory_updates.append(('extend_set', subcategory, values))
         updates.append((
             'array_upsert',
-            'threat_category',
-            {'id': category_data['id'], 'role': category_data['role']},
+            '_threat_category',
+            {'date': category_data['date'], 'id': category_data['id'], 'role': category_data['role']},
             [('add', 'n_reports.otx_receiver', 1), *subcategory_updates]
         ))
 
diff --git a/NERDd/warden_receiver.py b/NERDd/warden_receiver.py
index 6e8c6c8f..1546c341 100644
--- a/NERDd/warden_receiver.py
+++ b/NERDd/warden_receiver.py
@@ -397,51 +397,6 @@ def should_pass(self, idea_message):
             # if no rule matched, then do default action
             return self.default_action()
 
-##############################################################################
-# Threat categorization
-
-categorization_config = None
-malware_families = None
-
-def classify_ip(ip_addr, event_data, source_data):
-    """
-    Assign a threat category based on the information provided in the incoming event
-
-    :return: List of assigned categories
-    """
-    global categorization_config
-    global malware_families
-    if categorization_config is None:
-        categorization_config = load_categorization_config("warden_receiver")
-        malware_families = load_malware_families()
-
-    output = []
-    event = ClassifiableEvent("warden_receiver", event_data, source_data)
-
-    for category_id, category_config in categorization_config.items():
-        for trigger in category_config["triggers"]:
-            result, subcategories = eval_trigger(trigger, event)
-            if result is True:
-                if "port" in category_config["subcategories"]:
-                    ports_from_config = subcategories.get("port", [])
-                    if event.target_ports or ports_from_config:
-                        subcategories["port"] = list(set(event.target_ports + ports_from_config))
-                if "protocol" in category_config["subcategories"]:
-                    protocols_from_config = subcategories.get("protocol", [])
-                    if event.protocols or protocols_from_config:
-                        subcategories["protocol"] = list(set(event.protocols + protocols_from_config))
-                if "malware_family" in category_config["subcategories"]:
-                    for family_id, family_data in malware_families.items():
-                        if match_str(family_data["common_name"], event.description):
-                            if "malware_family" not in subcategories:
-                                subcategories["malware_family"] = [family_id.lower()]
-                            else:
-                                subcategories["malware_family"].append(family_id.lower())
-                output.append({"id": category_id, "role": category_config["role"], "subcategories": subcategories})
-    if not output:
-        output.append({"id": "unknown", "role": "src", "subcategories": {}})
-    log_category(ip_addr, "warden_receiver", output, event)
-    return output
 
 ##############################################################################
 # Main module code
@@ -483,7 +438,7 @@ def stop(signal, frame):
     log.info("exiting")
 
 
-def receive_events(filer_path, eventdb, task_queue_writer, inactive_ip_lifetime, warden_filter=None):
+def receive_events(filer_path, eventdb, task_queue_writer, inactive_ip_lifetime, categorization_config, warden_filter=None):
     # Infinite loop reading events as files in given directory
     # This loop stops on SIGINT
     log.info("Reading IDEA files from {}/incoming".format(filer_path))
@@ -532,14 +487,14 @@ def receive_events(filer_path, eventdb, task_queue_writer, inactive_ip_lifetime,
                     ]
 
                     # threat categorization updates
-                    for category_data in classify_ip(ipv4, event, src):
+                    for category_data in classify_ip(ipv4, "warden_receiver", log, categorization_config, event, src):
                         subcategory_updates = []
                         for subcategory, values in category_data['subcategories'].items():
                             subcategory_updates.append(('extend_set', subcategory, values))
                         updates.append((
                             'array_upsert',
-                            'threat_category',
-                            {'id': category_data['id'], 'role': category_data['role']},
+                            '_threat_category',
+                            {'date': category_data['date'], 'id': category_data['id'], 'role': category_data['role']},
                             [('add', 'n_reports.warden_receiver', 1), *subcategory_updates]
                         ))
 
@@ -578,10 +533,20 @@ def receive_events(filer_path, eventdb, task_queue_writer, inactive_ip_lifetime,
     log.info("Loading config file {}".format(common_cfg_file))
     config.update(common.config.read_config(common_cfg_file))
 
+    # Read categorization config
+    categorization_cfg_file = os.path.join(config_base_path, 'threat_categorization.yml')
+    log.info("Loading config file {}".format(categorization_cfg_file))
+    config.update(common.config.read_config(categorization_cfg_file))
+
     inactive_ip_lifetime = config.get('record_life_length.warden', 14)
     warden_filter_rules = config.get('warden_filter', None)
     rabbit_config = config.get("rabbitmq")
     filer_path = config.get('warden_filer_path')
+    categorization_config = {
+        "categories": config.get('threat_categorization'),
+        "malware_families": common.config.read_config(config.get('malpedia_family_list_path'))
+    }
+
 
     if warden_filter_rules:
         try:
@@ -606,4 +571,4 @@ def receive_events(filer_path, eventdb, task_queue_writer, inactive_ip_lifetime,
     task_queue_writer.connect()
 
     signal.signal(signal.SIGINT, stop)
-    receive_events(filer_path, eventdb, task_queue_writer, inactive_ip_lifetime, warden_filter)
+    receive_events(filer_path, eventdb, task_queue_writer, inactive_ip_lifetime, categorization_config, warden_filter)
diff --git a/NERDd/worker.py b/NERDd/worker.py
index 6fb410c5..fd80fe5f 100755
--- a/NERDd/worker.py
+++ b/NERDd/worker.py
@@ -123,7 +123,8 @@ def main(cfg_file, process_index):
     import modules.fmp
     import modules.reserved_ip
     import modules.ttl_updater
-    
+    import modules.threat_category_summary
+
     # Instantiate modules
     # TODO create all modules automatically (loop over all modules.* and find all objects derived from NERDModule)
     #  or take if from configuration
@@ -149,6 +150,7 @@ def main(cfg_file, process_index):
         modules.fmp.FMP(),
         modules.reserved_ip.ReservedIPTags(),
         modules.ttl_updater.TTLUpdater(),
+        modules.threat_category_summary.ThreatCategorySummary(),
     ]
     
     
diff --git a/NERDweb/nerd_main.py b/NERDweb/nerd_main.py
index a8a9098f..34613fbb 100644
--- a/NERDweb/nerd_main.py
+++ b/NERDweb/nerd_main.py
@@ -70,8 +70,9 @@
 p_bl_config = common.config.read_config(p_bl_cfg_file)
 dnsbl_config = common.config.read_config(dnsbl_cfg_file)
 
-# Threat categorization config
-threat_categorization_config = common.threat_categorization.load_categorization_config()
+# Read threat categorization config
+categorization_cfg_file = os.path.join(cfg_dir, 'threat_categorization.yml')
+threat_categorization_config = common.config.read_config(categorization_cfg_file)["threat_categorization"]
 
 # Dict: blacklist_id -> parameters
 #  parameters should contain:
@@ -878,7 +879,7 @@ def __init__(self, *args, **kwargs):
 
         # Load categorization config to get list of all categories
         self.tc_role.choices = [("src", "Source"), ("dst", "Destination")]
-        self.tc_category.choices = [(cat_id, cat_data['label']) for cat_id, cat_data in threat_categorization_config.items()]
+        self.tc_category.choices = sorted([(cat_id, cat_data['label']) for cat_id, cat_data in threat_categorization_config.items()])
 
         # Number of occurrences for blacklists (list of blacklists is taken from configuration)
         bl_name2num = {item['_id']: int(item['n']) for item in mongo.db.n_ip_by_bl.find()}
@@ -963,7 +964,7 @@ def create_query(form):
     if form.tc_role.data or form.tc_category.data or form.tc_subcategory.data:
         if form.tc_role.data and len(form.tc_role.data) > 1 and form.tc_role_op.data == "and":
             for role in form.tc_role.data:
-                queries.append({"threat_category": {"$elemMatch": {"role": role}}})
+                queries.append({"_threat_category": {"$elemMatch": {"role": role}}})
         else:
             query = {}
             elem_match = {}
@@ -977,9 +978,9 @@ def create_query(form):
                 elem_match.update({subcategory_id: subcategory_value})
             if form.tc_category.data:
                 cat_op = '$and' if (form.tc_category_op.data == "and") else '$or'
-                query = {cat_op: [{"threat_category": {"$elemMatch": {**elem_match, "id": cat}}} for cat in form.tc_category.data]}
+                query = {cat_op: [{"_threat_category": {"$elemMatch": {**elem_match, "id": cat}}} for cat in form.tc_category.data]}
             else:
-                query = {"threat_category": {"$elemMatch": elem_match}}
+                query = {"_threat_category": {"$elemMatch": elem_match}}
             queries.append(query)
     if form.node.data:
         op = '$and' if (form.node_op.data == "and") else '$or'
@@ -1140,36 +1141,7 @@ def ips():
             ip['_showable_misp_events'] = showable_misp_events
 
             # Add info about threat category
-            categories = ip.get("threat_category", [{"id": "unknown", "role": "src"}])
-            records = []
-            table_rows = []
-            total_reports = {}
-            for category in categories:
-                for module, n_reports in category.get("n_reports", {}).items():
-                    if module not in total_reports:
-                        total_reports[module] = 0
-                    if type(n_reports) is list:
-                        n_reports = len(n_reports)
-                    total_reports[module] += n_reports
-            for category in categories:
-                a = []
-                for module, n_reports in category.get("n_reports", {}).items():
-                    if type(n_reports) is list:
-                        n_reports = len(n_reports)
-                    c = n_reports / total_reports[module]
-                    a.append(c)
-                if category["id"] == "unknown" or len(a) == 0:
-                    confidence = 0.0
-                else:
-                    confidence = sum(a) / len(a)
-                records.append([category["role"], category["id"], round(confidence, 2)])
-            records_sorted = sorted(records, key=lambda rec: rec[2], reverse=True)
-            for record in records_sorted:
-                record[2] = str(record[2])
-                table_rows.append(",".join(record))
-            ip["_threat_category_table_preview"] = ";".join(table_rows)
-            ip["_threat_category_role"] = records_sorted[0][0]
-            ip["_threat_category_id"] = records_sorted[0][1]
+            ip['_threat_category_table'] = create_threat_category_table(ip.get('_threat_category_summary', []), 0.25, 10)
     else:
         results = None
         if g.user and not g.ac('ipsearch'):
@@ -1178,6 +1150,35 @@ def ips():
     return render_template('ips.html', json=json, ctrydata=ctrydata, blacklist_info=blacklist_info, **locals())
 
 
+def create_threat_category_table(category_records, min_confidence, max_subcategory_values):
+    """
+    Build rows of the threat category table from records of an IP's '_threat_category_summary'.
+
+    :param category_records: List of dicts with keys 'role', 'id', 'confidence', 'sources' and 'subcategories'
+    :param min_confidence: Records with lower confidence are left out
+    :param max_subcategory_values: Longer lists of subcategory values are shown as "many"
+    :return: List of rows [role, category ID, subcategory HTML, tooltip HTML]; only the first row of a category has role and ID
+    """
+    from html import escape  # cells are rendered with "|safe", so event-derived text must be escaped here
+    source_names = {'warden_receiver': 'Warden', 'misp_receiver': 'MISP', 'otx_receiver': 'OTX',
+                    'dshield': 'DShield', 'blacklists': 'Blacklists'}
+    table_rows = []
+    for rec in category_records:
+        if rec['confidence'] < min_confidence:
+            continue
+        category_description = threat_categorization_config.get(rec['id'], {}).get('description', f"ERROR: missing configuration for category '{rec['id']}'")
+        # Sources without a display name are listed under their raw ID instead of raising KeyError
+        sources_str = ''.join([f"<li>{escape(str(source_names.get(source, source)))} ({n_reports})</li>" for source, n_reports in sorted(rec['sources'].items())])
+        tooltip_content = f"<b>{category_description}</b><br/><br/>Confidence: {rec['confidence']}<br/>Sources:<br/><ul>{sources_str}</ul>"
+        cells = []
+        for key, values in rec['subcategories'].items():
+            shown = ', '.join(escape(str(v)) for v in values) if len(values) <= max_subcategory_values else "<i>many</i>"
+            cells.append(f"{escape(str(key))}: {shown}")
+        for i, cell in enumerate(cells or [""]):  # a category without subcategories still gets one row
+            table_rows.append([rec['role'] if i == 0 else "", rec['id'] if i == 0 else "", cell, tooltip_content])
+    return table_rows
+
+
 @app.route('/_ips_count', methods=["POST"])
 def ips_count():
     log_ep.log('/ips_count')
@@ -1359,16 +1360,8 @@ def ip(ipaddr=None):
                             asn_list.append(asn)
                 ipinfo['asns'] = asn_list
 
-                threat_category_table = []
-                if 'threat_category' in ipinfo:
-                    for category in ipinfo['threat_category']:
-                        category_config = threat_categorization_config.get(category["id"], {})
-                        subcategories = category_config.get("subcategories", [])
-                        subcategory_content = f"{subcategories[0]}: {category[subcategories[0]]}" if subcategories and subcategories[0] in category else ""
-                        threat_category_table.append([category["role"], category["id"], subcategory_content])
-                        for subcategory in subcategories[1:]:
-                            if subcategory in category:
-                                threat_category_table.append(["", "", f"{subcategory}: {category[subcategory]}"])
+                # Create threat category table
+                threat_category_table = create_threat_category_table(ipinfo.get('_threat_category_summary', []), 0, 10)
 
                 # Pseudonymize node names if user is not allowed to see the original names
                 if not g.ac('nodenames'):
diff --git a/NERDweb/static/main.js b/NERDweb/static/main.js
index 4f52b374..0db9acf9 100644
--- a/NERDweb/static/main.js
+++ b/NERDweb/static/main.js
@@ -26,25 +26,6 @@ function create_event_table(data) { /* data are "dataset" field of a DOM node wi
   return content;
 }
 
-function create_threat_category_table_preview(data) {
-  if (data.tctablepreview == "") {
-    return "No categories";
-  }
-  var table = [];
-  var table_rows = data.tctablepreview.split(";");
-  for (i = 0; i < table_rows.length; i++) {
-    table.push(table_rows[i].split(","));
-  }
-  var content = "<table><tr><th>Role</th><th>Category</th><th>Confidence</th></tr>";
-  for (i = 0; i < table.length; i++) {
-    content += "<tr><td>"
-    content += table[i].join("</td><td>");
-    content += "</td></tr>";
-  }
-  content += "</table>";
-  return content;
-}
-
 $(function() {
   /* jQuery UI tooltip at:
      - country flags (with name of the country)
@@ -72,15 +53,14 @@ $(function() {
     content: function() { return create_event_table(this.dataset) }, /*$(".tooltip_event_table", this).html(); },*/
     tooltipClass: "events_tooltip"
   });
-  /* jQuery UI tooltip at "threat category" cell with confidence table */
-  $( ".threat_category" ).tooltip({
-    items: ".threat_category",
+  /* jQuery UI tooltip at "threat category" cell with info about each category */
+  $( ".threat_category_tooltip" ).tooltip({
+    items: ".threat_category_tooltip",
     track: false,
     show: false,
     hide: false,
     position: {my: "left bottom", at: "left-7px top-2px", collision: "flipfit"},
-    content: function() { return create_threat_category_table_preview(this.dataset) },
-    tooltipClass: "threat_category_tooltip"
+    content: function() { return $(this).attr('title') },
   });
   /* jQuery UI tooltip at times with "timeago" */
   $( ".time" ).tooltip({
diff --git a/NERDweb/static/style.css b/NERDweb/static/style.css
index c7a3eeb7..147d6b65 100644
--- a/NERDweb/static/style.css
+++ b/NERDweb/static/style.css
@@ -484,50 +484,36 @@ td.country a {
   color: inherit;
 }
 
-td.threat_category {
+.threat_category_preview table {
   text-align: left;
+  border: hidden;
 }
-td.threat_category span {
-  display: inline-block;
+.threat_category_preview table td {
   min-width: 1.3em;
   padding: 0 0.2em;
+  border-width: 0px 0px 0px 0px;
 }
-td.threat_category span + span {
+.threat_category_preview table td + td {
   color: #222;
   border-left: 2px dotted #888;
   padding: 0 0.5em;
 }
 
-.threat_category_tooltip table {
-  border: solid #000;
-  border-width: 0px 0px 1px 1px;
-  border-collapse: separate;
-  border-spacing: 0px;
-  margin: 0.4em 0;
-}
-.threat_category_tooltip table td,
-.threat_category_tooltip table th {
-  border: solid #000;
-  border-width: 1px 1px 0px 0px;
-  padding: 0.1em 0.4em;
-}
-.threat_category_tooltip table td {
+.threat_category_detail table {
   text-align: left;
+  border: solid #000;
+  border-width: 1px 1px 1px 1px;
 }
-.threat_category_tooltip table th {
-  text-align: center;
-}
-
-.threat_category_table table td,
-.threat_category_table table th {
+.threat_category_detail table td,
+.threat_category_detail table th {
   border: solid #000;
   border-width: 1px 1px 1px 1px;
   padding: 0.1em 0.4em;
 }
-.threat_category_table table td {
+.threat_category_detail table td {
   text-align: left;
 }
-.threat_category_table table th {
+.threat_category_detail table th {
   text-align: center;
 }
 
diff --git a/NERDweb/templates/ip.html b/NERDweb/templates/ip.html
index 2d18039c..0d554fb6 100644
--- a/NERDweb/templates/ip.html
+++ b/NERDweb/templates/ip.html
@@ -282,20 +282,20 @@ <h1>IP address</h1>
 {% endfor %}
 </dl>
 
-{# Threat category table #}
+{# Threat category summary #}
 {% if threat_category_table %}
 <p class="caption">Threat category</p>
-<div class="threat_category_table">
-<table>
-    <tr><th>Role</th><th>Category</th><th>Subcategory</th></tr>
-    {% for row in threat_category_table %}
-        <tr>
-        {% for col in row %}
-            <td>{{ col }}</td>
+<div class="threat_category_detail">
+    <table>
+        <tr><th>Role</th><th>Category</th><th>Subcategory</th></tr>
+        {% for row in threat_category_table %}
+            <tr class="threat_category_tooltip" title="{{ row[3] }}">
+            {% for col in row[:3] %}
+                <td>{{ col|safe }}</td>
+            {% endfor %}
+            </tr>
         {% endfor %}
-        </tr>
-    {% endfor %}
-</table>
+    </table>
 </div>
 {% endif %}
 
diff --git a/NERDweb/templates/ips.html b/NERDweb/templates/ips.html
index 37ecd5d6..ca8566e7 100644
--- a/NERDweb/templates/ips.html
+++ b/NERDweb/templates/ips.html
@@ -407,9 +407,20 @@ <h1>Search IP addresses by ...</h1>
       {% endfor %}
     {% endif %}
   </td>
-  <td class="threat_category" data-tctablepreview="{{ip._threat_category_table_preview}}">
-    <span>{{ip._threat_category_role}}</span>
-    <span>{{ip._threat_category_id}}</span>
+  <td>
+    <div class="threat_category_preview">
+      {% if ip._threat_category_table %}
+        <table>
+        {% for row in ip._threat_category_table %}
+            <tr class="threat_category_tooltip" title="{{ row[3] }}">
+            {% for col in row[:3] %}
+                <td>{{ col|safe }}</td>
+            {% endfor %}
+            </tr>
+        {% endfor %}
+        </table>
+      {% endif %}
+    </div>
   </td>
   <td class="time" {% if ip.ts_added %}data-time={{ ip.ts_added|date_to_int }}{% endif %}>{{ip.ts_added.strftime("%Y-%m-%d %H:%M:%S") if ip.ts_added else "--"}}</td>
   <td {% if ip.last_activity %}class="time" data-time={{ ip.last_activity|date_to_int }}{% endif %}>{{ip.last_activity.strftime("%Y-%m-%d %H:%M:%S") if ip.last_activity else "--"}}</td>
diff --git a/common/threat_categorization.py b/common/threat_categorization.py
index a19d00ef..e0c14110 100644
--- a/common/threat_categorization.py
+++ b/common/threat_categorization.py
@@ -2,11 +2,13 @@
 import ast
 from datetime import datetime
 
+from .utils import parse_rfc_time
+
 
 class ClassifiableEvent:
     def __getattr__(self, name):
         """
-        Override __getattr__ so that no error is raised when a module asks for a non-existing attribute
+        Override __getattr__ so that no error is raised when a trigger tries to use non-existing attribute
         :param name: Name of the attribute
         :return: Value of the attribute (or None if it does not exist)
         """
@@ -24,7 +26,7 @@ def __init__(self, module_name=None, *args):
         """
         Initialize the event (fill metadata from source module)
         :param module_name: Name of the attribute
-        :param *args: Module specific attributes (such as a list of protocols from Warden)
+        :param *args: Module specific attributes
         :return:
         """
         init_fn = getattr(self, f"init_{module_name}")
@@ -36,15 +38,17 @@ def init_warden_receiver(self, event, source):
         :param event: Source event
         :return:
         """
+        detect_time = parse_rfc_time(event["DetectTime"])
+        self.date = detect_time.strftime("%Y-%m-%d")
         self.categories = event.get('Category', [])
-        self.source_types = source.get('Type', [])
+        self.ip_info = ";".join(source.get('Type', []))
         self.description = event.get("Description", "")
         target_ports = []
         protocols = source.get('Proto', [])
         for target in event.get('Target', []):
             target_ports += target.get('Port', [])
             protocols += target.get('Proto', [])
-        self.target_ports = list(set(target_ports))
+        self.target_ports = [str(port) for port in set(target_ports)]
         self.protocols = list(set(protocols))
 
     def init_otx_receiver(self, pulse):
@@ -53,9 +57,12 @@ def init_otx_receiver(self, pulse):
         :param pulse: Source pulse
         :return:
         """
-        self.indicator_role = pulse.get('indicator_role', "")
-        self.indicator_title = pulse.get('indicator_title', "")
-        self.pulse_name = pulse.get('pulse_name', "")
+        self.date = datetime.strftime(pulse.get('pulse_created', datetime.now()), "%Y-%m-%d")
+        self.indicator_role = str(pulse.get('indicator_role', None))
+        self.ip_info = str(pulse.get('indicator_title', None))
+        self.description = str(pulse.get('pulse_name', None))
+        self.protocols = []
+        self.target_ports = []
 
     def init_misp_receiver(self, event, attrib, ip_role):
         """
@@ -65,10 +72,13 @@ def init_misp_receiver(self, event, attrib, ip_role):
         :param ip_role: Role of the IP address (src/dst/both)
         :return:
         """
+        self.date = datetime.strftime(attrib.get('date', datetime.now()), "%Y-%m-%d")
         self.tags = [tag["name"] for tag in event.get('tag_list', [])]
-        self.info = event.get('info', "")
-        self.attrib_comment = attrib.get('comment', "")
+        self.description = event.get('info', "")
+        self.ip_info = attrib.get('comment', "")
         self.ip_role = ip_role
+        self.protocols = []
+        self.target_ports = []
         try:
             if attrib['type'] == "ip-dst|port":
                 split_attrib = attrib['value'].split('|')
@@ -80,20 +90,74 @@ def init_misp_receiver(self, event, attrib, ip_role):
             pass
 
 
-def eval_trigger(trigger, event):
+def classify_ip(ip_addr, module_name, logger, config, *args):
+    """
+    Assign a threat category based on the information provided in the incoming event
+
+    :param module_name: Source module name; selects the event parser and the module-specific triggers
+    :param config: Dict with "categories" (category ID -> parameters) and "malware_families"
+    :param args: Module-specific event data passed on to ClassifiableEvent
+    :return: List of assigned categories (a single "unknown" record if no trigger matched)
+    """
+    output, event = [], None  # bound before "try" so the fallback below works even if event parsing fails
+    try:
+        event = ClassifiableEvent(module_name, *args)
+        for category_id, category_params in config["categories"].items():
+            category_triggers = category_params.get("triggers", {}).get("general", "False").split("\n") + \
+                                category_params.get("triggers", {}).get(module_name, "False").split("\n")
+            for trigger in category_triggers:
+                result, subcategories = eval_trigger(trigger, event, category_params, config, logger)
+                if result is True:
+                    output.append({"date": event.date, "id": category_id, "role": category_params["role"],
+                                   "subcategories": subcategories})
+                    break  # one matching trigger is enough for this category
+    except Exception as e:
+        logger.error(f"Error in threat category classification for IP {ip_addr}: {e}")
+    if not output:
+        date = getattr(event, "date", None) or datetime.now().strftime("%Y-%m-%d")  # no usable event -> today
+        output.append({"date": date, "id": "unknown", "role": "src", "subcategories": {}})
+        with open("/var/log/nerd/threat_categorization_unknown.log", "a+") as logfile:
+            logfile.write(f"[{datetime.now()}] MODULE: {module_name} IP: {ip_addr} EVENT-INFO: {event}\n")
+    logger.debug(f"Threat category classification for {ip_addr}: {output}; Event info: {event}")
+    return output
+
+
+def eval_trigger(trigger, event, category_params, config, logger):
     """
-    Evaluate a category trigger, i.e. a statement that resolves to either True or False
+    Evaluate a category trigger and, if it matches, collect the subcategories required by the category
     :param trigger: Trigger to be evaluated
     :param event: Source event (instance of ClassifiableEvent) from which the trigger reads data
+    :param category_params: Category parameters (its "subcategories" list names the required subcategories)
+    :param logger: Source module logger (evaluation errors are logged, not raised)
     :return: Result of the evaluation (True/False), dictionary with subcategory assignments
     """
     result = False
-    subcategories = {}
-    a = trigger.split("->")
-    if eval(a[0]) is True:
-        result = True
-    if len(a) > 1:
-        subcategories = ast.literal_eval(a[1].lstrip())
+    required_subcategories = category_params.get("subcategories", [])
+    subcategories = {s: [] for s in required_subcategories}  # each required subcategory starts as an empty list
+
+    try:
+        split_trigger = trigger.split("->")  # "<condition> -> <subcategory dict>"; the second part is optional
+        if eval(split_trigger[0]) is True:  # NOTE: eval() runs the trigger text as Python - trusted config only
+            result = True
+        if len(split_trigger) > 1:
+            subcategories.update(ast.literal_eval(split_trigger[1].lstrip()))
+    except Exception as e:
+        logger.error(f"Error when evaluating category trigger ({trigger}): {e}")
+        logger.error(f"Event info: {event}")
+
+    if result is True:
+        if "port" in required_subcategories:
+            subcategories["port"] += event.target_ports
+            subcategories["port"] = list(set(subcategories["port"]))  # remove duplicates
+        if "protocol" in required_subcategories:
+            subcategories["protocol"] += event.protocols
+            subcategories["protocol"] = list(set(subcategories["protocol"]))
+        if "malware_family" in required_subcategories:
+            text = f"{event.description};{event.ip_info}"  # family names are searched in both text fields
+            for family_id, family_data in config["malware_families"].items():  # family ID -> dict with "common_name"
+                if match_str(family_data["common_name"], text):
+                    subcategories["malware_family"].append(family_id.lower())
+            subcategories["malware_family"] = list(set(subcategories["malware_family"]))
     return result, subcategories
 
 
@@ -103,53 +167,6 @@ def match_str(str_a, str_b):
 
     Ignores character casing, whitespace and some special characters
     """
-    simplified_a = str_a.strip().replace("_", "").replace(".", "").lower()
-    simplified_b = str_b.strip().replace("_", "").replace(".", "").lower()
+    simplified_a = str_a.strip().replace("_", "").replace(".", "").replace("-", "").lower()
+    simplified_b = str_b.strip().replace("_", "").replace(".", "").replace("-", "").lower()
     return simplified_a in simplified_b
-
-
-def log_category(id, module, category, event):
-    """
-    Log assigned category
-    :param id: ID of the record (e.g. IP address or blacklist name)
-    :param module: Name of the source module
-    :param category: Assigned category
-    :param event: Source event (instance of ClassifiableEvent)
-    :return:
-    """
-    with open(f"/var/log/nerd/threat_categorization_{module}.log", "a+") as logfile:
-        logfile.write(f"{datetime.now()}\n")
-        logfile.write(f"ID: {id}\n")
-        logfile.write(f"Category: {category}\n")
-        logfile.write(f"Event: {event}\n")
-        logfile.write("===============================================\n")
-
-
-def load_malware_families():
-    """
-    Load the list of malware families downloaded from Malpedia
-    :return:
-    """
-    try:
-        with open("/data/malpedia/malware_families.yml", "r") as f:
-            return yaml.safe_load(f)
-    except Exception:
-        return {}
-
-
-def load_categorization_config(module_name=None):
-    """
-    Load categorization configuration for a specific module
-    :param module_name: Name of the source module
-    :return: Dictionary containing categorization config
-    """
-    categories = {}
-    categorization_config = yaml.safe_load(open("/etc/nerd/threat_categorization.yml"))
-    for category_id, category_config in categorization_config.items():
-        categories[category_id] = {
-            "label": category_config.get("label", ""),
-            "role": category_config.get("role", "src"),
-            "subcategories": category_config.get("subcategories", []),
-            "triggers": category_config.get("triggers", {}).get(module_name, "False").split("\n")
-        }
-    return categories
diff --git a/etc/threat_categorization.yml b/etc/threat_categorization.yml
index 700e2380..b22e5746 100644
--- a/etc/threat_categorization.yml
+++ b/etc/threat_categorization.yml
@@ -1,220 +1,258 @@
+# Path to YAML file with Malpedia's list of malware families
+# Used for malware subcategory classification
+malpedia_family_list_path: "/data/malpedia/malware_families.yml"
 
-unknown:
-  role: src
-  description: The IP was reported as a source of malicious/unexpected/rouge packets, but without any further specification.
-  label: Unknown
 
-scan:
-  role: src
-  description: The IP address performs a common network scanning, i.e. it tries to connect to various targets to search for open ports/services.
-  label: Scanning
-  subcategories:
-    - port
-  triggers:
-    warden_receiver: |-
-      'Recon.Scanning' in event.categories
-    otx_receiver: |-
-      event.indicator_role == 'scanning_host'
-      event.pulse_name == 'Webscanners 2018-02-09 thru current day'
-    misp_receiver: |-
-      'CERT-XLM:information-gathering="scanner"' in event.tags
-      'ecsirt:information-gathering="scanner"' in event.tags
-      'circl:incident-classification="scan"' in event.tags
+# Threat categorization
+# Structure:
+#   dict{category ID -> category parameters}
+#   Category parameters:
+#   - role: IP role (src/dst) that will be assigned along with the main category
+#   - label: Displayed name of the category
+#   - description: General description of the category
+#   - subcategories: List of required subcategories (port, protocol, malware_family)
+#           Supported subcategories:
+#           - port
+#           - protocol
+#           - malware_family
+#
+#   - triggers: List of category triggers (single string separated by newlines)
+#               Divided into sections for each source module so that they do not have to evaluate unnecessary triggers
+#               IP is assigned a category if a related trigger is evaluated as True
+#
+#           Supported syntax:
+#           - triggers can have two parts separated by '->', both use standard Python syntax
+#           - first part is mandatory and should resolve to either True or False
+#           - second part is optional and contains a dictionary definition, which is used to specify subcategories
+#
+#           Evaluation:
+#           - triggers are evaluated by source modules when a new event is received (except for the blacklists module,
+#             which instead treats each trigger as a blacklist ID for the blacklist -> category mapping)
+#           - IP is assigned a category if any of the related statements resolve to True
+#           - if required by the category configuration, the IP is also assigned subcategories based on the second
+#             part of the statement (can be empty)
+#           - within each statement it is possible to access an 'event' object (instance of ClassifiableEvent),
+#             which represents the event that is currently being classified.
+#           - event properties:
+#             - date: Date of the event (YYYY-MM-DD)
+#             - description: Event description (string)
+#             - ip_info: Additional info about the IP (string, e.g. attribute comment from MISP)
+#             - categories/tags/indicator_role: List of event categories/tags/indicators that can be used for classification
+#             - protocols: List of protocols used by the IP
+#             - target_ports: List of target ports used by the IP
 
-bruteforce:
-  role: src
-  description: The IP performs dictionary (or bruteforce) attacks on password-protected services. Usually accompanied with scanning - searching for the targeted service.
-  label: Bruteforce
-  subcategories:
-    - protocol
-    - port
-  triggers:
-    warden_receiver: |-
-      'Attempt.Login' in event.categories
-      'Intrusion.UserCompromise' in event.categories
-      'SSH login' in event.description -> {'protocol': ['ssh']}
-    otx_receiver: |-
-      event.indicator_role == 'bruteforce'
-      'Telnet Login attempt' in event.indicator_title -> {'protocol': ['telnet']}
-      'Telnet honeypot logs' in event.pulse_name -> {'protocol': ['telnet']}
-      'SSH honeypot logs' in event.pulse_name -> {'protocol': ['ssh']}
-      'RDP honeypot logs' in event.pulse_name -> {'protocol': ['rdp']}
-      'VNC honeypot logs' in event.pulse_name
-      'Redis honeypot logs' in event.pulse_name
-      'PostgresQL honeypot logs' in event.pulse_name
-      'SSH intrusion attempt' in event.indicator_title -> {'protocol': ['ssh']}
-      'RDP intrusion attempt' in event.indicator_title -> {'protocol': ['rdp']}
-    misp_receiver: |-
-      'CERT-XLM:intrusion-attempts="login-attempts"' in event.tags
-      'ecsirt:intrusion-attempts="brute-force"' in event.tags
-    blacklists: |-
-      bruteforceblocker
-      blocklist_de-ssh -> {'protocol': ['ssh']}
-      blocklist_de-bruteforcelogin
-      charles_the_haleys_ssh_dico_ips -> {'protocol': ['ssh']}
-      charles_the_haleys_smtp_dico_ips -> {'protocol': ['smtp']}
-      dataplane_org_sshclient -> {'protocol': ['ssh']}
-      dataplane_org_sshpwauth -> {'protocol': ['ssh']}
-      dataplane_org_telnet_login -> {'protocol': ['telnet']}
+threat_categorization:
+  unknown:
+    role: src
+    description: The IP was reported as a source of malicious/unexpected/rogue packets, but without any further specification.
+    label: Unknown
 
-ddos:
-  role: src
-  description: The IP has been observed as a source of volumetric (D)DoS attacks.
-  label: DDoS
-  triggers:
-    warden_receiver: |-
-      'DoS anomalies' in event.description
-      'Availability.DoS' in event.categories
-      'Availability.DDoS' in event.categories
-    misp_receiver: |-
-      'DDoS' in event.tags
-      'CERT-XLM:availability="dos"' in event.tags
-      'CERT-XLM:availability="ddos"' in event.tags
-      'ecsirt:availability="dos"' in event.tags
-      'ecsirt:availability="ddos"' in event.tags
-      'circl:incident-classification="denial-of-service"' in event.tags
+  scan:
+    role: src
+    description: The IP address performs a common network scanning, i.e. it tries to connect to various targets to search for open ports/services.
+    label: Scanning
+    subcategories:
+      - port
+    triggers:
+      general: |-
+        match_str('scan', event.ip_info)
+      warden_receiver: |-
+        any([match_str('Recon', cat) for cat in event.categories])
+      otx_receiver: |-
+        match_str('scan', event.indicator_role)
+        event.description == 'Webscanners 2018-02-09 thru current day'
+      misp_receiver: |-
+        'CERT-XLM:information-gathering="scanner"' in event.tags
+        'ecsirt:information-gathering="scanner"' in event.tags
+        'circl:incident-classification="scan"' in event.tags
 
-ddos-amplifier:
-  role: dst
-  description: The IP runs a service which can be (and often is) misused as an amplifier for DDoS attacks, e.g. open DNS resolvers, NTP servers, memcached, etc.
-  label: DDoS amplifier
-  subcategories:
-    - protocol
-  triggers:
-    warden_receiver: |-
-      'Vulnerable.Config' in event.categories and 'dns' in event.protocols
-      'Vulnerable.Config' in event.categories and 'ntp' in event.protocols
-      'Vulnerable.Config' in event.categories and 'memcached' in event.protocols
-      'Backscatter' in event.source_types
-      'Open DNS Resolver' in event.description -> {'protocol': ['dns']}
-      'Open Memcached' in event.description -> {'protocol': ['memcached']}
-      'Abusable NTP' in event.description -> {'protocol': ['ntp']}
+  bruteforce:
+    role: src
+    description: The IP performs dictionary (or bruteforce) attacks on password-protected services. Usually accompanied with scanning - searching for the targeted service.
+    label: Bruteforce
+    subcategories:
+      - protocol
+      - port
+    triggers:
+      general: |-
+        match_str('SSH login', (event.ip_info + event.description)) -> {'protocol': ['ssh']}
+        match_str('SSH intrusion', (event.ip_info + event.description)) -> {'protocol': ['ssh']}
+        match_str('SSH honeypot', (event.ip_info + event.description)) -> {'protocol': ['ssh']}
+        match_str('RDP honeypot', (event.ip_info + event.description)) -> {'protocol': ['rdp']}
+        match_str('Telnet login', (event.ip_info + event.description)) -> {'protocol': ['telnet']}
+        match_str('Telnet honeypot', (event.ip_info + event.description)) -> {'protocol': ['telnet']}
+        match_str('bruteforce', event.ip_info)
+      warden_receiver: |-
+        'Attempt.Login' in event.categories
+        'Intrusion.UserCompromise' in event.categories
+        'Intrusion.AdminCompromise' in event.categories
+      otx_receiver: |-
+        match_str('bruteforce', event.indicator_role)
+        'VNC honeypot logs' in event.description
+        'Redis honeypot logs' in event.description
+        'PostgresQL honeypot logs' in event.description
+      misp_receiver: |-
+        'CERT-XLM:intrusion-attempts="login-attempts"' in event.tags
+        'ecsirt:intrusion-attempts="brute-force"' in event.tags
+      blacklists: |-
+        blocklist_de-ssh -> {'protocol': ['ssh']}
+        charles_the_haleys_ssh_dico_ips -> {'protocol': ['ssh']}
+        charles_the_haleys_smtp_dico_ips -> {'protocol': ['smtp']}
+        dataplane_org_sshclient -> {'protocol': ['ssh']}
+        dataplane_org_sshpwauth -> {'protocol': ['ssh']}
+        dataplane_org_telnet_login -> {'protocol': ['telnet']}
+        bruteforceblocker
+        blocklist_de-bruteforcelogin
 
-spam:
-  role: src
-  description: The IP is sending spam.
-  label: Spam
-  triggers:
-    warden_receiver: |-
-      'Abusive.Spam' in event.categories
-      'OriginSpam' in event.source_types
-      'Spam' in event.source_types
-    misp_receiver: |-
-      'CERT-XLM:abusive-content="spam"' in event.tags
-      'ecsirt:abusive-content="spam"' in event.tags
-      'circl:incident-classification="spam"' in event.tags
-    blacklists: |-
-      sblam_ips
-      psbl
-      spamhaus_edrop
+  ddos:
+    role: src
+    description: The IP has been observed as a source of volumetric (D)DoS attacks.
+    label: DDoS
+    triggers:
+      warden_receiver: |-
+        'DoS anomalies' in event.description
+        'Availability.DoS' in event.categories
+        'Availability.DDoS' in event.categories
+      misp_receiver: |-
+        'DDoS' in event.tags
+        'CERT-XLM:availability="dos"' in event.tags
+        'CERT-XLM:availability="ddos"' in event.tags
+        'ecsirt:availability="dos"' in event.tags
+        'ecsirt:availability="ddos"' in event.tags
+        'circl:incident-classification="denial-of-service"' in event.tags
 
-malware_distribution:
-  role: dst
-  description: The IP is used to distribute a malware, e.g. hosts an HTTP URL from which a malware is being downloaded.
-  label: Malware distribution
-  subcategories:
-    - malware_family
-  triggers:
-    warden_receiver: |-
-      'Malware' in event.source_types
-      'Malware' in event.categories
-      'Malware.Virus' in event.categories
-      'Malware.Worm' in event.categories
-      'Malware.Trojan' in event.categories
-      'Malware.Spyware' in event.categories
-      'Malware.Dialer' in event.categories
-      'Malware.Rootkit' in event.categories
-      'OriginMalware' in event.source_types
-    otx_receiver: |-
-      event.indicator_role == 'trojan'
-      event.indicator_role == 'malware_hosting'
-    misp_receiver: |-
-      'Malware download' in event.attrib_comment
-      'payload_delivery' in event.attrib_comment
-      'MALWARE' in event.tags and 'dst' in event.ip_role
-      'Keylogger' in event.tags and 'dst' in event.ip_role
-      'infostealer' in event.tags and 'dst' in event.ip_role
-      'Ransomware' in event.tags and 'dst' in event.ip_role
-      'Remote Access Trojan' in event.tags and 'dst' in event.ip_role
-      'MalSpam' in event.tags and 'dst' in event.ip_role
-      'circl:incident-classification="malware"' in event.tags and 'dst' in event.ip_role
-      'ecsirt:malicious-code="malware"' in event.tags and 'dst' in event.ip_role
-    blacklists: |-
-      urlhouse_ips
+  ddos-amplifier:
+    role: dst
+    description: The IP runs a service which can be (and often is) misused as an amplifier for DDoS attacks, e.g. open DNS resolvers, NTP servers, memcached, etc.
+    label: DDoS amplifier
+    subcategories:
+      - protocol
+    triggers:
+      general: |-
+        match_str('Open DNS', (event.ip_info + event.description)) -> {'protocol': ['dns']}
+        match_str('Open Memcached', (event.ip_info + event.description)) -> {'protocol': ['memcached']}
+        match_str('Abusable NTP', (event.ip_info + event.description)) -> {'protocol': ['ntp']}
+      warden_receiver: |-
+        'Vulnerable.Config' in event.categories and 'dns' in event.protocols -> {'protocol': ['dns']}
+        'Vulnerable.Config' in event.categories and 'ntp' in event.protocols -> {'protocol': ['ntp']}
+        'Vulnerable.Config' in event.categories and 'memcached' in event.protocols -> {'protocol': ['memcached']}
+        'Backscatter' in event.ip_info and 'dns' in event.protocols -> {'protocol': ['dns']}
+        'Backscatter' in event.ip_info and 'ntp' in event.protocols -> {'protocol': ['ntp']}
+        'Backscatter' in event.ip_info and 'memcached' in event.protocols -> {'protocol': ['memcached']}
 
-cc:
-  role: dst
-  description: The IP is used as Command&Control server for a botnet/malware.
-  label: Command and control
-  subcategories:
-    - malware_family
-  triggers:
-    warden_receiver: |-
-      'CC' in event.source_types
-    otx_receiver: |-
-      event.indicator_role == 'command_and_control'
-      'Command and Control' in event.indicator_title
-    misp_receiver: |-
-      'botnet_cc' in event.attrib_comment
-      'C2 server' in event.attrib_comment
-      'kill-chain:Command and Control' in event.tags and 'dst' in event.ip_role
-      'ecsirt:malicious-code="c&c"' in event.tags and 'dst' in event.ip_role
-    blacklists: |-
-      feodo
-      bambenek_c2
+  spam:
+    role: src
+    description: The IP is sending spam.
+    label: Spam
+    triggers:
+      general: |-
+        match_str('spam', event.ip_info)
+      warden_receiver: |-
+        'Abusive.Spam' in event.categories
+      misp_receiver: |-
+        'CERT-XLM:abusive-content="spam"' in event.tags
+        'ecsirt:abusive-content="spam"' in event.tags
+        'circl:incident-classification="spam"' in event.tags
+      blacklists: |-
+        sblam_ips
+        psbl
+        spamhaus_edrop
 
-botnet_drone:
-  role: src
-  description: The IP is acting as a bot/drone of a botnet.
-  label: Botnet drone
-  subcategories:
-    - malware_family
-  triggers:
-    warden_receiver: |-
-      'Intrusion.Botnet' in event.categories
-      'Botnet' in event.source_types
-    misp_receiver: |-
-      'CERT-XLM:intrusion="botnet-member"' in event.tags
-      'ecsirt:malicious-code="botnet-drone"' in event.tags
-    blacklists: |-
-      mirai_tracker_ips -> {'malware_family': ['elf.mirai']}
+  malware_distribution:
+    role: dst
+    description: The IP is used to distribute a malware, e.g. hosts an HTTP URL from which a malware is being downloaded.
+    label: Malware distribution
+    subcategories:
+      - malware_family
+    triggers:
+      general: |-
+        match_str('malware', event.ip_info)
+        match_str('trojan', event.ip_info)
+        match_str('ransomware', event.ip_info)
+        match_str('payload delivery', event.ip_info)
+      warden_receiver: |-
+        any([match_str('Malware', cat) for cat in event.categories])
+      otx_receiver: |-
+        match_str('malware', event.indicator_role)
+        match_str('trojan', event.indicator_role)
+      misp_receiver: |-
+        any([match_str('malware', tag) for tag in event.tags])
+        any([match_str('ransomware', tag) for tag in event.tags])
+        any([match_str('trojan', tag) for tag in event.tags])
+        'circl:incident-classification="malware"' in event.tags
+        'ecsirt:malicious-code="malware"' in event.tags
+      blacklists: |-
+        urlhouse_ips
 
-phishing_site:
-  role: dst
-  description: The IP is hosting a phishing website.
-  label: Phishing site
-  triggers:
-    warden_receiver: |-
-      'Fraud.Phishing' in event.categories
-      'Phishing' in event.source_types
-    misp_receiver: |-
-      'Phishing' in event.tags
-      'Phishing Site' in event.tags
-      'CERT-XLM:fraud="phishing"' in event.tags
-      'ecsirt:fraud="phishing"' in event.tags
-      'circl:incident-classification="phishing"' in event.tags
-      'circl:incident-classification="whaling"' in event.tags
-      'circl:incident-classification="smishing"' in event.tags
-    blacklists: |-
-      openphish
+  cc:
+    role: dst
+    description: The IP is used as Command&Control server for a botnet/malware.
+    label: Command and control
+    subcategories:
+      - malware_family
+    triggers:
+      general: |-
+        match_str('command and control', event.ip_info)
+        match_str('botnet cc', event.ip_info)
+        match_str('c2 server', event.ip_info)
+      warden_receiver: |-
+        'CC' in event.ip_info
+      otx_receiver: |-
+        match_str('command and control', event.indicator_role)
+      misp_receiver: |-
+        'C2' in event.tags
+        'kill-chain:Command and Control' in event.tags
+        'ecsirt:malicious-code="c&c"' in event.tags
+      blacklists: |-
+        feodo
+        bambenek_c2
 
-exploit:
-  role: src
-  description: The IP is attempting to exploit known vulnerabilities.
-  label: Exploit
-  subcategories:
-    - protocol
-  triggers:
-    warden_receiver: |-
-      'Attempt.Exploit' in event.categories
-    otx_receiver: |-
-      'Apache honeypot logs' in event.pulse_name -> {'protocol': ['http']}
-      event.indicator_role == 'exploit_source'
-      event.indicator_role == 'exploit_kit'
-    misp_receiver: |-
-      'Exploit Kit' in event.tags
-      'CERT-XLM:intrusion-attempts="exploit-known-vuln"' in event.tags
-      'CERT-XLM:intrusion-attempts="new-attack-signature"' in event.tags
-      'ecsirt:intrusion-attempts="exploit"' in event.tags
-      'circl:incident-classification="XSS"' in event.tags
-      'circl:incident-classification="sql-injection"' in event.tags
+  botnet_drone:
+    role: src
+    description: The IP is acting as a bot/drone of a botnet.
+    label: Botnet drone
+    subcategories:
+      - malware_family
+    triggers:
+      warden_receiver: |-
+        'Intrusion.Botnet' in event.categories
+        'Botnet' in event.ip_info
+      misp_receiver: |-
+        'CERT-XLM:intrusion="botnet-member"' in event.tags
+        'ecsirt:malicious-code="botnet-drone"' in event.tags
+      blacklists: |-
+        mirai_tracker_ips -> {'malware_family': ['elf.mirai']}
+
+  phishing_site:
+    role: dst
+    description: The IP is hosting a phishing website.
+    label: Phishing site
+    triggers:
+      general: |-
+        match_str('phishing', event.ip_info)
+      warden_receiver: |-
+        'Fraud.Phishing' in event.categories
+      misp_receiver: |-
+        any([match_str('phishing', tag) for tag in event.tags])
+      blacklists: |-
+        openphish
+
+  exploit:
+    role: src
+    description: The IP is attempting to exploit known vulnerabilities.
+    label: Exploit
+    subcategories:
+      - protocol
+    triggers:
+      general: |-
+        match_str('exploit', event.ip_info)
+      warden_receiver: |-
+        'Attempt.Exploit' in event.categories
+      otx_receiver: |-
+        'Apache honeypot logs' in event.description -> {'protocol': ['http']}
+        match_str('exploit', event.indicator_role)
+      misp_receiver: |-
+        any([match_str('exploit', tag) for tag in event.tags])
+        'CERT-XLM:intrusion-attempts="new-attack-signature"' in event.tags
+        'circl:incident-classification="XSS"' in event.tags
+        'circl:incident-classification="sql-injection"' in event.tags
diff --git a/scripts/misp_updater.py b/scripts/misp_updater.py
index fb0eec68..8a31767c 100755
--- a/scripts/misp_updater.py
+++ b/scripts/misp_updater.py
@@ -20,6 +20,7 @@
 from common.config import read_config
 import NERDd.core.mongodb as mongodb
 from common.task_queue import TaskQueueWriter
+from common.threat_categorization import *
 
 DEFAULT_MONGO_HOST = 'localhost'
 DEFAULT_MONGO_PORT = 27017
@@ -64,6 +65,15 @@
 logger.info("Loading config file {}".format(common_cfg_file))
 config.update(read_config(common_cfg_file))
 
+# Read categorization config
+categorization_cfg_file = os.path.join(config_base_path, 'threat_categorization.yml')
+logger.info("Loading config file {}".format(categorization_cfg_file))
+config.update(read_config(categorization_cfg_file))
+categorization_config = {
+    "categories": config.get('threat_categorization'),
+    "malware_families": read_config(config.get('malpedia_family_list_path'))
+}
+
 inactive_ip_lifetime = config.get('record_life_length.misp', 180)
 
 db = mongodb.MongoEntityDatabase(config)
@@ -330,6 +340,7 @@ def process_ip(ip_addr, ip_info):
     :return: None
     """
     logger.debug("Processing IP: {}".format(ip_addr))
+    update_requests = []
 
     # check ip record in DB
     try:
@@ -358,6 +369,9 @@ def process_ip(ip_addr, ip_info):
                         dedup_event['role'] = "src and dst at the same time"
                         event_ids_roles[index][1] = "src and dst at the same time"
 
+    # aggregated threat category classification records
+    threat_category = {}
+
     # create all misp events and save the youngest datetime of the event for keep alive token
     events = []
     youngest_date = datetime(year=2000, month=1, day=1, hour=0, minute=0, second=0)
@@ -368,18 +382,56 @@ def process_ip(ip_addr, ip_info):
         if youngest_date < new_event['date']:
             youngest_date = new_event['date']
 
+        attrib = {'type': '', 'value': '', 'comment': ''}  # TODO get attrib info from misp
+        ip_role = event_info.get('role', "")
+        for category in classify_ip(ip_addr, "misp_receiver", logger, categorization_config, new_event, attrib, ip_role):
+            role = category['role']
+            id = category['id']
+            date = category['date']
+            subcategories = category['subcategories']
+            if role not in threat_category:
+                threat_category[role] = {}
+            if id not in threat_category[role]:
+                threat_category[role][id] = {}
+            if date not in threat_category[role][id]:
+                threat_category[role][id][date] = {}
+            if 'n_reports' not in threat_category[role][id][date]:
+                threat_category[role][id][date]['n_reports'] = 0
+            if 'subcategories' not in threat_category[role][id][date]:
+                threat_category[role][id][date]['subcategories'] = {}
+            for subcategory in subcategories:
+                old = threat_category[role][id][date]['subcategories'].get(subcategory, [])
+                new = subcategories[subcategory]
+                threat_category[role][id][date]['subcategories'][subcategory] = list(set(old + new))
+            threat_category[role][id][date]['n_reports'] += 1
+
+    # threat category updates
+    for role in threat_category:
+        for id in threat_category[role]:
+            for date in threat_category[role][id]:
+                n_reports = threat_category[role][id][date]['n_reports']
+                subcategory_updates = []
+                for subcategory, values in threat_category[role][id][date]['subcategories'].items():
+                    subcategory_updates.append(('extend_set', subcategory, values))
+                update_requests += [(
+                    'array_upsert',
+                    '_threat_category',
+                    {'date': date, 'id': id, 'role': role},
+                    [('add', 'n_reports.misp_receiver', n_reports), *subcategory_updates]
+                )]
+
     if events:
         live_till = youngest_date + timedelta(days=inactive_ip_lifetime)
         if db_entity is not None:
             # compare 'misp_events' attrib from NERD with events list, if not same --> insert, else do not insert
             if db_entity.get('misp_events', {}) != events:
                 # construct new update request and send it
-                update_requests = [('set', 'misp_events', events), ('set', '_ttl.misp', live_till),
+                update_requests += [('set', 'misp_events', events), ('set', '_ttl.misp', live_till),
                                    ('setmax', 'last_activity', youngest_date)]
                 tq.put_task('ip', ip_addr, update_requests, "misp_updater")
         else:
             # ip address not even in NERD --> insert it
-            update_requests = [('set', 'misp_events', events), ('set', '_ttl.misp', live_till), ('setmax',
+            update_requests += [('set', 'misp_events', events), ('set', '_ttl.misp', live_till), ('setmax',
                                                                 'last_activity', youngest_date)]
             tq.put_task('ip', ip_addr, update_requests, "misp_updater")
 

From fa4976b050191b8995f73e360c27d40fcece2d6b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mat=C4=9Bj=20Barnat?= <barnat@cesnet.cz>
Date: Thu, 11 Jan 2024 09:00:07 +0100
Subject: [PATCH 04/21] threat_categorization - use 'pulse_modified' date
 (instead of 'pulse_created') for otx records

---
 common/threat_categorization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/threat_categorization.py b/common/threat_categorization.py
index e0c14110..0d05ef8d 100644
--- a/common/threat_categorization.py
+++ b/common/threat_categorization.py
@@ -57,7 +57,7 @@ def init_otx_receiver(self, pulse):
         :param pulse: Source pulse
         :return:
         """
-        self.date = datetime.strftime(pulse.get('pulse_created', datetime.now()), "%Y-%m-%d")
+        self.date = datetime.strftime(pulse.get('pulse_modified', datetime.now()), "%Y-%m-%d")
         self.indicator_role = str(pulse.get('indicator_role', None))
         self.ip_info = str(pulse.get('indicator_title', None))
         self.description = str(pulse.get('pulse_name', None))

From 43f3c331b8ce828b610f794be9f7b8817a1c7e87 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mat=C4=9Bj=20Barnat?= <barnat@cesnet.cz>
Date: Tue, 20 Feb 2024 13:01:18 +0100
Subject: [PATCH 05/21] Shorten keys used in threat category attributes:   id
 -> c   role -> r   date -> d   n_reports -> src   subcategories -> s  
 confidence -> conf

---
 NERDd/blacklists.py                      |  4 +--
 NERDd/dshield.py                         |  2 +-
 NERDd/misp_receiver.py                   |  4 +--
 NERDd/modules/threat_category_summary.py | 36 ++++++++++++------------
 NERDd/otx_receiver.py                    |  4 +--
 NERDd/warden_receiver.py                 |  4 +--
 NERDweb/nerd_main.py                     | 28 +++++++++---------
 scripts/misp_updater.py                  |  4 +--
 8 files changed, 43 insertions(+), 43 deletions(-)

diff --git a/NERDd/blacklists.py b/NERDd/blacklists.py
index bbc71fde..5ccbd5f9 100755
--- a/NERDd/blacklists.py
+++ b/NERDd/blacklists.py
@@ -212,8 +212,8 @@ def get_blacklist(id, name, url, regex, bl_type, life_length, params, categoriza
             ('setmax', '_ttl.bl', now_plus_life_length),
             ('array_upsert', 'bl', {'n': id},
                 [('set', 'v', 1), ('set', 't', download_time), ('append', 'h', download_time)]),
-            ('array_upsert', '_threat_category', {'date': download_time.strftime("%Y-%m-%d"), 'id': category["id"], 'role': category["role"]},
-                [('add', 'n_reports.blacklists', 1)], *subcategory_updates)
+            ('array_upsert', '_threat_category', {'d': download_time.strftime("%Y-%m-%d"), 'c': category["id"], 'r': category["role"]},
+                [('add', 'src.bl', 1)], *subcategory_updates)
         ], "blacklists")
 
 
diff --git a/NERDd/dshield.py b/NERDd/dshield.py
index 251c0e61..5c22aa36 100644
--- a/NERDd/dshield.py
+++ b/NERDd/dshield.py
@@ -93,7 +93,7 @@ def process_feed(feed_data):
              [('set', 'reports', ips[ip_addr]["reports"]),
               ('set', 'targets', ips[ip_addr]["targets"])]),
             ('setmax', '_ttl.dshield', ttl_date),
-            ('array_upsert', '_threat_category', {'date': date_str, 'id': 'scan', 'role': 'src'}, [('set', 'n_reports.dshield', ips[ip_addr]["reports"])])
+            ('array_upsert', '_threat_category', {'d': date_str, 'c': 'scan', 'r': 'src'}, [('set', 'src.dshield', ips[ip_addr]["reports"])])
         ], "dshield")
     logger.info("Tasks created")
 
diff --git a/NERDd/misp_receiver.py b/NERDd/misp_receiver.py
index 00156c5f..d0eb0afc 100644
--- a/NERDd/misp_receiver.py
+++ b/NERDd/misp_receiver.py
@@ -301,8 +301,8 @@ def upsert_new_event(event, attrib, sighting_list, role=None):
         updates.append((
             'array_upsert',
             '_threat_category',
-            {'date': category_data['date'], 'id': category_data['id'], 'role': category_data['role']},
-            [('add', 'n_reports.misp_receiver', 1), *subcategory_updates]
+            {'d': category_data['date'], 'c': category_data['id'], 'r': category_data['role']},
+            [('add', 'src.misp', 1), *subcategory_updates]
         ))
 
     logger.debug(f"Updates for {ip_addr}:")
diff --git a/NERDd/modules/threat_category_summary.py b/NERDd/modules/threat_category_summary.py
index 9166536b..b52aa6b9 100644
--- a/NERDd/modules/threat_category_summary.py
+++ b/NERDd/modules/threat_category_summary.py
@@ -55,7 +55,7 @@ def create_summary(self, ekey, rec, updates):
 
         grouped_by_category = {}
         for record in rec['_threat_category']:
-            cat = record['id']
+            cat = record['c']
             if cat not in grouped_by_category:
                 grouped_by_category[cat] = []
             grouped_by_category[cat].append(record)
@@ -66,43 +66,43 @@ def create_summary(self, ekey, rec, updates):
 
         for cat, records in grouped_by_category.items():
             cat_summary = {
-                'role': records[0]['role'],
-                'id': records[0]['id'],
-                'sources': {},
-                'subcategories': {}
+                'r': records[0]['r'],
+                'c': records[0]['c'],
+                'src': {},
+                's': {}
             }
             sources = {}
             subcategories = {}
             sum_weight = 0
             confidence = 0
             for record in deepcopy(records):
-                date = record['date']
+                date = record['d']
                 date = datetime.date(int(date[0:4]), int(date[5:7]), int(date[8:10]))
                 record_age_days = (today - date).days
                 if record_age_days >= DATE_RANGE:
                     continue
                 daily_reports = 0
-                for source in record['n_reports']:
+                for source in record['src']:
                     if source not in sources:
                         sources[source] = 0
-                    sources[source] += record['n_reports'][source]
-                    daily_reports += record['n_reports'][source]
-                daily_confidence = nonlin(daily_reports) * nonlin(len(record['n_reports']))
+                    sources[source] += record['src'][source]
+                    daily_reports += record['src'][source]
+                daily_confidence = nonlin(daily_reports) * nonlin(len(record['src']))
                 weight = float(DATE_RANGE - record_age_days) / DATE_RANGE
                 sum_weight += weight
                 confidence += daily_confidence * weight
-                del record['date']
-                del record['role']
-                del record['id']
-                del record['n_reports']
+                del record['d']
+                del record['r']
+                del record['c']
+                del record['src']
                 for key, values in record.items():
                     if key not in subcategories:
                         subcategories[key] = set()
                     subcategories[key].update(values)
             if confidence > 0:
-                cat_summary['confidence'] = round(confidence / sum_weight, 2)
-                cat_summary['sources'] = sources
-                cat_summary['subcategories'] = {k: list(v) for k, v in subcategories.items()}
+                cat_summary['conf'] = round(confidence / sum_weight, 2)
+                cat_summary['src'] = sources
+                cat_summary['s'] = {k: list(v) for k, v in subcategories.items()}
                 summary.append(cat_summary)
-        summary = sorted(summary, key=lambda rec: rec['confidence'], reverse=True)
+        summary = sorted(summary, key=lambda rec: rec['conf'], reverse=True)
         return [('set', '_threat_category_summary', summary)]
diff --git a/NERDd/otx_receiver.py b/NERDd/otx_receiver.py
index efaf7903..948b9a9c 100644
--- a/NERDd/otx_receiver.py
+++ b/NERDd/otx_receiver.py
@@ -166,8 +166,8 @@ def upsert_new_pulse(pulse, indicator):
         updates.append((
             'array_upsert',
             '_threat_category',
-            {'date': category_data['date'], 'id': category_data['id'], 'role': category_data['role']},
-            [('add', 'n_reports.otx_receiver', 1), *subcategory_updates]
+            {'d': category_data['date'], 'c': category_data['id'], 'r': category_data['role']},
+            [('add', 'src.otx', 1), *subcategory_updates]
         ))
 
     # put task in queue
diff --git a/NERDd/warden_receiver.py b/NERDd/warden_receiver.py
index ca9efc29..1ee98d0a 100644
--- a/NERDd/warden_receiver.py
+++ b/NERDd/warden_receiver.py
@@ -502,8 +502,8 @@ def receive_events(filer_path, eventdb, task_queue_writer, inactive_ip_lifetime,
                         updates.append((
                             'array_upsert',
                             '_threat_category',
-                            {'date': category_data['date'], 'id': category_data['id'], 'role': category_data['role']},
-                            [('add', 'n_reports.warden_receiver', 1), *subcategory_updates]
+                            {'d': category_data['date'], 'c': category_data['id'], 'r': category_data['role']},
+                            [('add', 'src.warden', 1), *subcategory_updates]
                         ))
 
                     # put task in queue
diff --git a/NERDweb/nerd_main.py b/NERDweb/nerd_main.py
index 63d5502e..5a989f6a 100644
--- a/NERDweb/nerd_main.py
+++ b/NERDweb/nerd_main.py
@@ -982,13 +982,13 @@ def create_query(form):
     if form.tc_role.data or form.tc_category.data or form.tc_subcategory.data:
         if form.tc_role.data and len(form.tc_role.data) > 1 and form.tc_role_op.data == "and":
             for role in form.tc_role.data:
-                queries.append({"_threat_category": {"$elemMatch": {"role": role}}})
+                queries.append({"_threat_category": {"$elemMatch": {"r": role}}})
         else:
             query = {}
             elem_match = {}
             if form.tc_role.data:
                 role_op = '$and' if (form.tc_role_op.data == "and") else '$or'
-                elem_match.update({role_op: [{"role": role} for role in form.tc_role.data]})
+                elem_match.update({role_op: [{"r": role} for role in form.tc_role.data]})
             if form.tc_subcategory.data:
                 subcategory_id, subcategory_value = form.tc_subcategory.data.split("=")
                 if subcategory_id == "port":
@@ -996,7 +996,7 @@ def create_query(form):
                 elem_match.update({subcategory_id: subcategory_value})
             if form.tc_category.data:
                 cat_op = '$and' if (form.tc_category_op.data == "and") else '$or'
-                query = {cat_op: [{"_threat_category": {"$elemMatch": {**elem_match, "id": cat}}} for cat in form.tc_category.data]}
+                query = {cat_op: [{"_threat_category": {"$elemMatch": {**elem_match, "c": cat}}} for cat in form.tc_category.data]}
             else:
                 query = {"_threat_category": {"$elemMatch": elem_match}}
             queries.append(query)
@@ -1175,26 +1175,26 @@ def ips():
 
 def create_threat_category_table(category_records, min_confidence, max_subcategory_values):
     source_names = {
-        'warden_receiver': 'Warden',
-        'misp_receiver': 'MISP',
-        'otx_receiver': 'OTX',
+        'warden': 'Warden',
+        'misp': 'MISP',
+        'otx': 'OTX',
         'dshield': 'DShield',
-        'blacklists': 'Blacklists'
+        'bl': 'Blacklists'
     }
     table_rows = []
     for rec in category_records:
-        if rec['confidence'] < min_confidence:
+        if rec['conf'] < min_confidence:
             continue
-        category_description = threat_categorization_config.get(rec['id'], {}).get('description', f"ERROR: missing configuration for category '{rec['id']}'")
-        sources_str = ''.join([f"<li>{source_names[source]} ({n_reports})</li>" for source, n_reports in sorted(rec['sources'].items())])
-        tooltip_content = f"<b>{category_description}</b><br/><br/>Confidence: {rec['confidence']}<br/>Sources:<br/><ul>{sources_str}</ul>"
-        subcategories = list(rec['subcategories'].items())
+        category_description = threat_categorization_config.get(rec['c'], {}).get('description', f"ERROR: missing configuration for category '{rec['c']}'")
+        sources_str = ''.join([f"<li>{source_names[source]} ({n_reports})</li>" for source, n_reports in sorted(rec['src'].items())])
+        tooltip_content = f"<b>{category_description}</b><br/><br/>Confidence: {rec['conf']}<br/>Sources:<br/><ul>{sources_str}</ul>"
+        subcategories = list(rec['s'].items())
         if not subcategories:
-            table_rows.append([rec['role'], rec['id'], "", tooltip_content])
+            table_rows.append([rec['r'], rec['c'], "", tooltip_content])
         else:
             key, values = subcategories[0]
             subcategory_content = f"{key}: {', '.join(values)}" if len(values) <= max_subcategory_values else f"{key}: <i>many</i>"
-            table_rows.append([rec['role'], rec['id'], subcategory_content, tooltip_content])
+            table_rows.append([rec['r'], rec['c'], subcategory_content, tooltip_content])
             for item in subcategories[1:]:
                 key, values = item
                 subcategory_content = f"{key}: {', '.join(values)}" if len(values) <= max_subcategory_values else f"{key}: <i>many</i>"
diff --git a/scripts/misp_updater.py b/scripts/misp_updater.py
index 8a31767c..09dd6bda 100755
--- a/scripts/misp_updater.py
+++ b/scripts/misp_updater.py
@@ -416,8 +416,8 @@ def process_ip(ip_addr, ip_info):
                 update_requests += [(
                     'array_upsert',
                     '_threat_category',
-                    {'date': date, 'id': id, 'role': role},
-                    [('add', 'n_reports.misp_receiver', n_reports), *subcategory_updates]
+                    {'d': date, 'c': id, 'r': role},
+                    [('add', 'src.misp', n_reports), *subcategory_updates]
                 )]
 
     if events:

From 03cc1edf84801fa7fd19ecde82407a3cebd0b4c5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mat=C4=9Bj=20Barnat?= <barnat@cesnet.cz>
Date: Thu, 22 Feb 2024 07:16:58 +0100
Subject: [PATCH 06/21] Remove 'role' from threat category records (only needed
 in summary attribute)

---
 NERDd/blacklists.py                      |  2 +-
 NERDd/dshield.py                         |  2 +-
 NERDd/misp_receiver.py                   |  2 +-
 NERDd/modules/threat_category_summary.py | 11 ++++++++---
 NERDd/otx_receiver.py                    |  2 +-
 NERDd/warden_receiver.py                 |  2 +-
 etc/nerd.yml                             |  3 +++
 scripts/misp_updater.py                  |  2 +-
 8 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/NERDd/blacklists.py b/NERDd/blacklists.py
index 5ccbd5f9..9037516c 100755
--- a/NERDd/blacklists.py
+++ b/NERDd/blacklists.py
@@ -212,7 +212,7 @@ def get_blacklist(id, name, url, regex, bl_type, life_length, params, categoriza
             ('setmax', '_ttl.bl', now_plus_life_length),
             ('array_upsert', 'bl', {'n': id},
                 [('set', 'v', 1), ('set', 't', download_time), ('append', 'h', download_time)]),
-            ('array_upsert', '_threat_category', {'d': download_time.strftime("%Y-%m-%d"), 'c': category["id"], 'r': category["role"]},
+            ('array_upsert', '_threat_category', {'d': download_time.strftime("%Y-%m-%d"), 'c': category["id"]},
                 [('add', 'src.bl', 1)], *subcategory_updates)
         ], "blacklists")
 
diff --git a/NERDd/dshield.py b/NERDd/dshield.py
index 5c22aa36..fcc85d17 100644
--- a/NERDd/dshield.py
+++ b/NERDd/dshield.py
@@ -93,7 +93,7 @@ def process_feed(feed_data):
              [('set', 'reports', ips[ip_addr]["reports"]),
               ('set', 'targets', ips[ip_addr]["targets"])]),
             ('setmax', '_ttl.dshield', ttl_date),
-            ('array_upsert', '_threat_category', {'d': date_str, 'c': 'scan', 'r': 'src'}, [('set', 'src.dshield', ips[ip_addr]["reports"])])
+            ('array_upsert', '_threat_category', {'d': date_str, 'c': 'scan'}, [('set', 'src.dshield', ips[ip_addr]["reports"])])
         ], "dshield")
     logger.info("Tasks created")
 
diff --git a/NERDd/misp_receiver.py b/NERDd/misp_receiver.py
index d0eb0afc..0269ed12 100644
--- a/NERDd/misp_receiver.py
+++ b/NERDd/misp_receiver.py
@@ -301,7 +301,7 @@ def upsert_new_event(event, attrib, sighting_list, role=None):
         updates.append((
             'array_upsert',
             '_threat_category',
-            {'d': category_data['date'], 'c': category_data['id'], 'r': category_data['role']},
+            {'d': category_data['date'], 'c': category_data['id']},
             [('add', 'src.misp', 1), *subcategory_updates]
         ))
 
diff --git a/NERDd/modules/threat_category_summary.py b/NERDd/modules/threat_category_summary.py
index b52aa6b9..9ce25f80 100644
--- a/NERDd/modules/threat_category_summary.py
+++ b/NERDd/modules/threat_category_summary.py
@@ -5,10 +5,12 @@
 """
 
 from core.basemodule import NERDModule
+import common.config
 import g
 
 from copy import deepcopy
 import datetime
+import os
 
 
 def nonlin(val, coef=0.5, max=20):
@@ -25,6 +27,9 @@ class ThreatCategorySummary(NERDModule):
     """
 
     def __init__(self):
+        categorization_config_file = os.path.join(g.config_base_path, g.config.get("threat_categorization_config"))
+        self.config = common.config.read_config(categorization_config_file).get("threat_categorization", {})
+
         g.um.register_handler(
             self.create_summary,  # function (or bound method) to call
             'ip',  # entity type
@@ -65,9 +70,10 @@ def create_summary(self, ekey, rec, updates):
         summary = []
 
         for cat, records in grouped_by_category.items():
+            role = self.config[cat]['role']
             cat_summary = {
-                'r': records[0]['r'],
-                'c': records[0]['c'],
+                'r': role,
+                'c': cat,
                 'src': {},
                 's': {}
             }
@@ -92,7 +98,6 @@ def create_summary(self, ekey, rec, updates):
                 sum_weight += weight
                 confidence += daily_confidence * weight
                 del record['d']
-                del record['r']
                 del record['c']
                 del record['src']
                 for key, values in record.items():
diff --git a/NERDd/otx_receiver.py b/NERDd/otx_receiver.py
index 948b9a9c..080b4b41 100644
--- a/NERDd/otx_receiver.py
+++ b/NERDd/otx_receiver.py
@@ -166,7 +166,7 @@ def upsert_new_pulse(pulse, indicator):
         updates.append((
             'array_upsert',
             '_threat_category',
-            {'d': category_data['date'], 'c': category_data['id'], 'r': category_data['role']},
+            {'d': category_data['date'], 'c': category_data['id']},
             [('add', 'src.otx', 1), *subcategory_updates]
         ))
 
diff --git a/NERDd/warden_receiver.py b/NERDd/warden_receiver.py
index 1ee98d0a..b6ba9128 100644
--- a/NERDd/warden_receiver.py
+++ b/NERDd/warden_receiver.py
@@ -502,7 +502,7 @@ def receive_events(filer_path, eventdb, task_queue_writer, inactive_ip_lifetime,
                         updates.append((
                             'array_upsert',
                             '_threat_category',
-                            {'d': category_data['date'], 'c': category_data['id'], 'r': category_data['role']},
+                            {'d': category_data['date'], 'c': category_data['id']},
                             [('add', 'src.warden', 1), *subcategory_updates]
                         ))
 
diff --git a/etc/nerd.yml b/etc/nerd.yml
index dfb9f798..8fa9084d 100644
--- a/etc/nerd.yml
+++ b/etc/nerd.yml
@@ -68,6 +68,9 @@ dnsbl: dns_blacklists.yml
 # Configuration file for EventCountLogger
 event_logging_config: event_logging.yml
 
+# Threat categorization configuration file
+threat_categorization_config: threat_categorization.yml
+
 # EventDB type (where to store/read events), may be one of:
 #  'psql' - (default) Local PostgreSQL database (needs config in 'eventdb_psql' in nerdd.yml)
 #  'mentat' - External Mentat instance (no storage by NERD, load via Mentat API) (needs config in 'eventdb_mentat')
diff --git a/scripts/misp_updater.py b/scripts/misp_updater.py
index 09dd6bda..26a62812 100755
--- a/scripts/misp_updater.py
+++ b/scripts/misp_updater.py
@@ -416,7 +416,7 @@ def process_ip(ip_addr, ip_info):
                 update_requests += [(
                     'array_upsert',
                     '_threat_category',
-                    {'d': date, 'c': id, 'r': role},
+                    {'d': date, 'c': id},
                     [('add', 'src.misp', n_reports), *subcategory_updates]
                 )]
 

From a41e5d7ec31c4706263808b99da4e4571ba2b8f6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mat=C4=9Bj=20Barnat?= <barnat@cesnet.cz>
Date: Thu, 22 Feb 2024 07:21:08 +0100
Subject: [PATCH 07/21] Limit the maximum number of subcategory values stored
 in threat category attributes

---
 NERDd/modules/threat_category_summary.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/NERDd/modules/threat_category_summary.py b/NERDd/modules/threat_category_summary.py
index 9ce25f80..aef673de 100644
--- a/NERDd/modules/threat_category_summary.py
+++ b/NERDd/modules/threat_category_summary.py
@@ -58,6 +58,7 @@ def create_summary(self, ekey, rec, updates):
         if '_threat_category' not in rec:
             return None # No threat category records, nothing to do
 
+        subcategory_max_length = 10
         grouped_by_category = {}
         for record in rec['_threat_category']:
             cat = record['c']
@@ -65,6 +66,11 @@ def create_summary(self, ekey, rec, updates):
                 grouped_by_category[cat] = []
             grouped_by_category[cat].append(record)
 
+            # limit the number of subcategory values in each record
+            for key, values in record.items():
+                if type(record[key]) is list:
+                    record[key] = record[key][:subcategory_max_length]
+
         today = datetime.datetime.utcnow().date()
         DATE_RANGE = 14
         summary = []
@@ -107,7 +113,7 @@ def create_summary(self, ekey, rec, updates):
             if confidence > 0:
                 cat_summary['conf'] = round(confidence / sum_weight, 2)
                 cat_summary['src'] = sources
-                cat_summary['s'] = {k: list(v) for k, v in subcategories.items()}
+                cat_summary['s'] = {k: list(v)[:subcategory_max_length] for k, v in subcategories.items()}
                 summary.append(cat_summary)
         summary = sorted(summary, key=lambda rec: rec['conf'], reverse=True)
         return [('set', '_threat_category_summary', summary)]

From d96ef6fed9327b67c4f77aaa3fe1f4b22926f290 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mat=C4=9Bj=20Barnat?= <barnat@cesnet.cz>
Date: Thu, 22 Feb 2024 09:10:21 +0100
Subject: [PATCH 08/21] Fix threat category keys in cleaner module

---
 NERDd/modules/cleaner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/NERDd/modules/cleaner.py b/NERDd/modules/cleaner.py
index 5422605c..294baa3a 100644
--- a/NERDd/modules/cleaner.py
+++ b/NERDd/modules/cleaner.py
@@ -193,8 +193,8 @@ def clear_threat_category(self, ekey, rec, updates):
         # Remove all threat category records with day before cut_day
         actions = []
         for category_record in rec.get('_threat_category', []):
-            if category_record['date'] < cut_day:  # Thanks to ISO format it's OK to compare dates as strings
-                actions.append(('array_remove', '_threat_category', {'date': category_record['date'], 'id': category_record['id'], 'role': category_record['role']}))
+            if category_record['d'] < cut_day:  # Thanks to ISO format it's OK to compare dates as strings
+                actions.append(('array_remove', '_threat_category', {'d': category_record['d'], 'c': category_record['c']}))
 
         if actions:
             self.log.debug("Cleaning {}: Removing {} old threat category records".format(key, len(actions) - 1))

From dbbc82fc0e2cdcbba040b2fd56797363e8447a97 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mat=C4=9Bj=20Barnat?= <barnat@cesnet.cz>
Date: Mon, 18 Mar 2024 15:12:34 +0100
Subject: [PATCH 09/21] blacklists: use IP-specific info from blacklist records
 for better threat category classification

---
 NERDd/blacklists.py             | 79 ++++++++++-----------------------
 common/threat_categorization.py | 19 +++++++-
 etc/primary_blacklists.yml      |  4 +-
 etc/threat_categorization.yml   | 41 ++++++++++-------
 4 files changed, 66 insertions(+), 77 deletions(-)

diff --git a/NERDd/blacklists.py b/NERDd/blacklists.py
index 9037516c..b8af874f 100755
--- a/NERDd/blacklists.py
+++ b/NERDd/blacklists.py
@@ -40,49 +40,6 @@
     'ip': {'singular': "IP", 'plural': "IPs"}
 }
 
-###############################################################################
-# Threat categorization
-
-blacklist_to_category = None
-
-
-def categorization_init(categorization_config):
-    """
-    Create a blacklist -> category mapping based on the categorization config in 'etc/threat_categorization.yml'
-
-    :return:
-    """
-    global blacklist_to_category
-
-    blacklist_to_category = {}
-    for category_id, category_config in categorization_config["categories"].items():
-        category_triggers = category_config.get("triggers", {}).get("blacklists", "").split("\n")
-        for line in category_triggers:
-            split_line = line.split("->")
-            blacklist_id = split_line[0]
-            subcategories = {}
-            if len(split_line) > 1:
-                subcategories = ast.literal_eval(split_line[1].lstrip())
-            blacklist_to_category[blacklist_id] = {"id": category_id, "role": category_config["role"], "subcategories": subcategories}
-
-
-def classify_blacklist(blacklist_id, categorization_config):
-    """
-    Assign a threat category based on the blacklist -> category mapping created by categorization_init()
-
-    :return: Assigned category
-    """
-    global blacklist_to_category
-
-    if blacklist_to_category is None:
-        categorization_init(categorization_config)
-
-    if blacklist_id in blacklist_to_category:
-        return blacklist_to_category[blacklist_id]
-    else:
-        return {"role": "src", "id": "unknown", "subcategories": {}}
-
-
 ###############################################################################
 
 def compile_regex(regex):
@@ -110,14 +67,16 @@ def parse_bl_with_regex(bl_data, cregex):
             record_end = ip_match.span()[1]
             try:
                 # classic IP address blacklist
-                bl_records.append(str(ipaddress.IPv4Address(bl_data[record_start:record_end])))
+                bl_records.append((str(ipaddress.IPv4Address(bl_data[record_start:record_end])), None))
             except ipaddress.AddressValueError:
                 continue
     else:
         for line in bl_data.split('\n'):
             match = cregex.search(line)
             if match:
-                bl_records.append(str(ipaddress.IPv4Address(match.group(1))))
+                ip = str(ipaddress.IPv4Address(match.group(1)))
+                ip_info = match.group(2) if cregex.groups > 1 else None
+                bl_records.append((ip, ip_info))
     return bl_records
 
 
@@ -132,7 +91,7 @@ def parse_bl_without_regex(bl_data):
             ipaddr = ipaddress.IPv4Address(record)
         except ipaddress.AddressValueError:
             continue
-        bl_records.append(str(ipaddr))
+        bl_records.append((str(ipaddr), None))
     return bl_records
 
 
@@ -202,19 +161,27 @@ def get_blacklist(id, name, url, regex, bl_type, life_length, params, categoriza
 
     log.info("{} IPs found in '{}', sending tasks to NERD workers".format(len(bl_records), id))
 
-    category = classify_blacklist(id, categorization_config)
-    subcategory_updates = []
-    for subcategory, values in category['subcategories'].items():
-        subcategory_updates.append(('extend_set', subcategory, values))
-
-    for ip in bl_records:
-        task_queue_writer.put_task('ip', ip, [
+    for ip, ip_info in bl_records:
+        updates = [
             ('setmax', '_ttl.bl', now_plus_life_length),
             ('array_upsert', 'bl', {'n': id},
                 [('set', 'v', 1), ('set', 't', download_time), ('append', 'h', download_time)]),
-            ('array_upsert', '_threat_category', {'d': download_time.strftime("%Y-%m-%d"), 'c': category["id"]},
-                [('add', 'src.bl', 1)], *subcategory_updates)
-        ], "blacklists")
+        ]
+
+        # threat categorization updates
+        for category_data in classify_ip(ip, "blacklists", log, categorization_config, id, ip_info, download_time):
+            subcategory_updates = []
+            for subcategory, values in category_data['subcategories'].items():
+                subcategory_updates.append(('extend_set', subcategory, values))
+            updates.append((
+                'array_upsert',
+                '_threat_category',
+                {'d': category_data['date'], 'c': category_data['id']},
+                [('add', 'src.bl', 1), *subcategory_updates]
+            ))
+
+        # put task in queue
+        task_queue_writer.put_task('ip', ip, updates, "blacklists")
 
 
 def stop(signal, frame):
diff --git a/common/threat_categorization.py b/common/threat_categorization.py
index 0d05ef8d..32919696 100644
--- a/common/threat_categorization.py
+++ b/common/threat_categorization.py
@@ -89,6 +89,21 @@ def init_misp_receiver(self, event, attrib, ip_role):
         except ValueError:
             pass
 
+    def init_blacklists(self, blacklist_id, ip_info, download_time):
+        """
+        Fill in metadata from a blacklist record
+        :param blacklist_id: ID of the blacklist (stored as the event description)
+        :param ip_info: Additional info about the IP; stored via str() (None -> 'None')
+        :param download_time: Time when the blacklist was downloaded
+        :return: None
+        """
+        self.date = download_time.strftime("%Y-%m-%d")
+        # Keep the blacklist ID: 'blacklists' triggers compare event.description to it
+        self.description = blacklist_id
+        self.ip_info = str(ip_info)
+        self.protocols = []
+        self.target_ports = []
+
 
 def classify_ip(ip_addr, module_name, logger, config, *args):
     """
@@ -116,8 +131,8 @@ def classify_ip(ip_addr, module_name, logger, config, *args):
         logger.error(f"Error in threat category classification for IP {ip_addr}: {e}")
     if not output:
         output.append({"date": event.date, "id": "unknown", "role": "src", "subcategories": {}})
-        with open(f"/var/log/nerd/threat_categorization_unknown.log", "a+") as logfile:
-           logfile.write(f"[{datetime.now()}] MODULE: {module_name} IP: {ip_addr} EVENT-INFO: {event}\n")
+    #   with open(f"/var/log/nerd/threat_categorization_unknown.log", "a+") as logfile:
+    #      logfile.write(f"[{datetime.now()}] MODULE: {module_name} IP: {ip_addr} EVENT-INFO: {event}\n")
     logger.debug(f"Threat category classification for {ip_addr}: {output}; Event info: {event}")
     return output
 
diff --git a/etc/primary_blacklists.yml b/etc/primary_blacklists.yml
index bbcca081..8197b4da 100644
--- a/etc/primary_blacklists.yml
+++ b/etc/primary_blacklists.yml
@@ -182,7 +182,7 @@ iplists:
   firehol_link: http://iplists.firehol.org/?ipset=blocklist_net_ua
   provider_link: https://blocklist.net.ua/about/
   url: https://blocklist.net.ua/blocklist.csv
-  regex: "^(\\A)"
+  regex: "^(\\A);.*;.*;(.*)"
   # The time of updating the list was viewed here: https://blocklist.net.ua/providers/  
   time:
     hour: "*/4"
@@ -272,7 +272,7 @@ iplists:
   descr: ThreatFox is a free platform from abuse.ch with the goal of<br>sharing indicators of compromise (IOCs) associated with malware with the<br>infosec community, AV vendors and threat intelligence providers.
   provider_link: https://threatfox.abuse.ch/
   url: https://threatfox.abuse.ch/export/csv/ip-port/recent/
-  regex: ".*?(\\A)"
+  regex: ".*?(\\A),(.*)"
   # The time of updating the list was viewed here: https://threatfox.abuse.ch/export/
   time:
     hour: "*/4"
diff --git a/etc/threat_categorization.yml b/etc/threat_categorization.yml
index b22e5746..29455c1a 100644
--- a/etc/threat_categorization.yml
+++ b/etc/threat_categorization.yml
@@ -95,14 +95,17 @@ threat_categorization:
         'CERT-XLM:intrusion-attempts="login-attempts"' in event.tags
         'ecsirt:intrusion-attempts="brute-force"' in event.tags
       blacklists: |-
-        blocklist_de-ssh -> {'protocol': ['ssh']}
-        charles_the_haleys_ssh_dico_ips -> {'protocol': ['ssh']}
-        charles_the_haleys_smtp_dico_ips -> {'protocol': ['smtp']}
-        dataplane_org_sshclient -> {'protocol': ['ssh']}
-        dataplane_org_sshpwauth -> {'protocol': ['ssh']}
-        dataplane_org_telnet_login -> {'protocol': ['telnet']}
-        bruteforceblocker
-        blocklist_de-bruteforcelogin
+        event.description == 'blocklist_de-ssh' -> {'protocol': ['ssh']}
+        event.description == 'charles_the_haleys_ssh_dico_ips' -> {'protocol': ['ssh']}
+        event.description == 'charles_the_haleys_smtp_dico_ips' -> {'protocol': ['smtp']}
+        event.description == 'dataplane_org_sshclient' -> {'protocol': ['ssh']}
+        event.description == 'dataplane_org_sshpwauth' -> {'protocol': ['ssh']}
+        event.description == 'dataplane_org_telnet_login' -> {'protocol': ['telnet']}
+        event.description == 'bruteforceblocker'
+        event.description == 'blocklist_de-bruteforcelogin'
+        match_str('Brute force passwords using SSH', event.ip_info) -> {'protocol': ['ssh']}
+        match_str('Brute force passwords using FTP', event.ip_info) -> {'protocol': ['ftp']}
+        match_str('Brute force passwords to SIP', event.ip_info) -> {'protocol': ['sip']}
 
   ddos:
     role: src
@@ -120,6 +123,9 @@ threat_categorization:
         'ecsirt:availability="dos"' in event.tags
         'ecsirt:availability="ddos"' in event.tags
         'circl:incident-classification="denial-of-service"' in event.tags
+      blacklists: |-
+        match_str('HTTP flood', event.ip_info) -> {'protocol': ['http']}
+        match_str('DDoS', event.ip_info)
 
   ddos-amplifier:
     role: dst
@@ -154,9 +160,10 @@ threat_categorization:
         'ecsirt:abusive-content="spam"' in event.tags
         'circl:incident-classification="spam"' in event.tags
       blacklists: |-
-        sblam_ips
-        psbl
-        spamhaus_edrop
+        event.description == 'sblam_ips'
+        event.description == 'psbl'
+        event.description == 'spamhaus_edrop'
+        match_str('Send spam', event.ip_info)
 
   malware_distribution:
     role: dst
@@ -182,7 +189,7 @@ threat_categorization:
         'circl:incident-classification="malware"' in event.tags
         'ecsirt:malicious-code="malware"' in event.tags
       blacklists: |-
-        urlhouse_ips
+        event.description == 'urlhouse_ips'
 
   cc:
     role: dst
@@ -204,8 +211,8 @@ threat_categorization:
         'kill-chain:Command and Control' in event.tags
         'ecsirt:malicious-code="c&c"' in event.tags
       blacklists: |-
-        feodo
-        bambenek_c2
+        event.description == 'feodo'
+        event.description == 'bambenek_c2'
 
   botnet_drone:
     role: src
@@ -221,7 +228,7 @@ threat_categorization:
         'CERT-XLM:intrusion="botnet-member"' in event.tags
         'ecsirt:malicious-code="botnet-drone"' in event.tags
       blacklists: |-
-        mirai_tracker_ips -> {'malware_family': ['elf.mirai']}
+        event.description == 'mirai_tracker_ips' -> {'malware_family': ['elf.mirai']}
 
   phishing_site:
     role: dst
@@ -235,7 +242,7 @@ threat_categorization:
       misp_receiver: |-
         any([match_str('phishing', tag) for tag in event.tags])
       blacklists: |-
-        openphish
+        event.description == 'openphish'
 
   exploit:
     role: src
@@ -255,4 +262,4 @@ threat_categorization:
         any([match_str('exploit', tag) for tag in event.tags])
         'CERT-XLM:intrusion-attempts="new-attack-signature"' in event.tags
         'circl:incident-classification="XSS"' in event.tags
-        'circl:incident-classification="sql-injection"' in event.tags
+        'circl:incident-classification="sql-injection"' in event.tags
\ No newline at end of file

From 506fc985d72da1ef63d9f156f9bac6850a6612a8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mat=C4=9Bj=20Barnat?= <barnat@cesnet.cz>
Date: Thu, 4 Apr 2024 07:50:12 +0200
Subject: [PATCH 10/21] web: improved search by threat category, fixed CSS

---
 NERDweb/nerd_main.py       | 49 ++++++++++++-------------
 NERDweb/static/ips.js      |  1 -
 NERDweb/static/style.css   | 61 +++++++++++++++++++++++++++++--
 NERDweb/templates/ips.html | 73 +++++++++++++++++++++++++++-----------
 4 files changed, 133 insertions(+), 51 deletions(-)

diff --git a/NERDweb/nerd_main.py b/NERDweb/nerd_main.py
index 5a989f6a..820730c7 100644
--- a/NERDweb/nerd_main.py
+++ b/NERDweb/nerd_main.py
@@ -843,12 +843,12 @@ class IPFilterForm(FlaskForm):
     cat = SelectMultipleField('Event category', [validators.Optional()]) # Choices are set up dynamically (see below)
     tc_role = SelectMultipleField('Role', [validators.Optional()])
     tc_category = SelectMultipleField('Category', [validators.Optional()])
-    tc_subcategory = StringField('Subcategory', [
-        validators.Optional(),
-        validators.Regexp('^(\w+)=(\w+)$', re.IGNORECASE)
-    ], filters=[strip_whitespace, to_lower])
+    tc_subcategory_key = SelectField('Subcategory key', [validators.Optional()])
+    tc_subcategory_value = StringField('Subcategory value', [validators.Optional()], filters=[strip_whitespace, to_lower])
+    tc_confidence = FloatField('Min category confidence',
+                          [validators.Optional(), validators.NumberRange(0, 1, 'Must be a number between 0 and 1')],
+                          default=0.5)
     cat_op = HiddenField('', default="or")
-    tc_role_op = HiddenField('', default="or")
     tc_category_op = HiddenField('', default="or")
     node = SelectMultipleField('', [validators.Optional()])
     node_op = HiddenField('', default="or")
@@ -900,6 +900,7 @@ def __init__(self, *args, **kwargs):
         # Load categorization config to get list of all categories
         self.tc_role.choices = [("src", "Source"), ("dst", "Destination")]
         self.tc_category.choices = sorted([(cat_id, cat_data['label']) for cat_id, cat_data in threat_categorization_config.items()])
+        self.tc_subcategory_key.choices = [("", "--"), ("port", "Port"), ("protocol", "Protocol"), ("malware_family", "Malware family")]
 
         # Number of occurrences for blacklists (list of blacklists is taken from configuration)
         bl_name2num = {item['_id']: int(item['n']) for item in mongo.db.n_ip_by_bl.find()}
@@ -979,27 +980,20 @@ def create_query(form):
     if form.cat.data:
         op = '$and' if (form.cat_op.data == "and") else '$or'
         queries.append({op: [{'events.cat': cat} for cat in form.cat.data]})
-    if form.tc_role.data or form.tc_category.data or form.tc_subcategory.data:
-        if form.tc_role.data and len(form.tc_role.data) > 1 and form.tc_role_op.data == "and":
-            for role in form.tc_role.data:
-                queries.append({"_threat_category": {"$elemMatch": {"r": role}}})
+    if form.tc_role.data or form.tc_category.data or form.tc_subcategory_value.data:
+        elem_match = {}
+        if form.tc_confidence.data:
+            elem_match.update({"conf": {"$gte": float(form.tc_confidence.data)}})
+        if form.tc_role.data:
+            elem_match.update({'$or': [{"r": role} for role in form.tc_role.data]})
+        if form.tc_subcategory_key.data and form.tc_subcategory_value.data:
+            elem_match.update({f"s.{form.tc_subcategory_key.data}": form.tc_subcategory_value.data})
+        if form.tc_category.data:
+            cat_op = '$and' if (form.tc_category_op.data == "and") else '$or'
+            query = {cat_op: [{"_threat_category_summary": {"$elemMatch": {**elem_match, "c": cat}}} for cat in form.tc_category.data]}
         else:
-            query = {}
-            elem_match = {}
-            if form.tc_role.data:
-                role_op = '$and' if (form.tc_role_op.data == "and") else '$or'
-                elem_match.update({role_op: [{"r": role} for role in form.tc_role.data]})
-            if form.tc_subcategory.data:
-                subcategory_id, subcategory_value = form.tc_subcategory.data.split("=")
-                if subcategory_id == "port":
-                    subcategory_value = int(subcategory_value)
-                elem_match.update({subcategory_id: subcategory_value})
-            if form.tc_category.data:
-                cat_op = '$and' if (form.tc_category_op.data == "and") else '$or'
-                query = {cat_op: [{"_threat_category": {"$elemMatch": {**elem_match, "c": cat}}} for cat in form.tc_category.data]}
-            else:
-                query = {"_threat_category": {"$elemMatch": elem_match}}
-            queries.append(query)
+            query = {"_threat_category_summary": {"$elemMatch": elem_match}}
+        queries.append(query)
     if form.node.data:
         op = '$and' if (form.node_op.data == "and") else '$or'
         queries.append({op: [{'events.node': node} for node in form.node.data]})
@@ -1163,7 +1157,8 @@ def ips():
             ip['_showable_misp_events'] = showable_misp_events
 
             # Add info about threat category
-            ip['_threat_category_table'] = create_threat_category_table(ip.get('_threat_category_summary', []), 0.25, 10)
+            min_confidence = float(form.tc_confidence.data) if form.tc_confidence.data else 0
+            ip['_threat_category_table'] = create_threat_category_table(ip.get('_threat_category_summary', []), min_confidence, 9)
     else:
         results = None
         form.ip_list.data = ""
@@ -1384,7 +1379,7 @@ def ip(ipaddr=None):
                 ipinfo['asns'] = asn_list
 
                 # Create threat category table
-                threat_category_table = create_threat_category_table(ipinfo.get('_threat_category_summary', []), 0, 10)
+                threat_category_table = create_threat_category_table(ipinfo.get('_threat_category_summary', []), 0, 9)
 
                 # Pseudonymize node names if user is not allowed to see the original names
                 if not g.ac('nodenames'):
diff --git a/NERDweb/static/ips.js b/NERDweb/static/ips.js
index a9fc6f52..6600fe58 100644
--- a/NERDweb/static/ips.js
+++ b/NERDweb/static/ips.js
@@ -108,7 +108,6 @@ function set_up_search_form() {
 
     set_up_op_button("#source_op_button", "#source_op", "OR: At least one of the selected categories", "AND: All selected categories")
     set_up_op_button("#cat_op_button", "#cat_op", "OR: At least one of the selected categories", "AND: All selected categories")
-    set_up_op_button("#tc_role_op_button", "#tc_role_op", "OR: At least one of the selected roles", "AND: All selected roles")
     set_up_op_button("#tc_category_op_button", "#tc_category_op", "OR: At least one of the selected categories", "AND: All selected categories")
     set_up_op_button("#node_op_button", "#node_op", "OR: At least one of the selected nodes", "AND: All selected nodes")
     set_up_op_button("#bl_op_button", "#bl_op", "OR: At least one of the selected blacklists", "AND: All selected blacklists")
diff --git a/NERDweb/static/style.css b/NERDweb/static/style.css
index c2572318..7f18ee0c 100644
--- a/NERDweb/static/style.css
+++ b/NERDweb/static/style.css
@@ -489,7 +489,7 @@ td.country a {
   border: hidden;
 }
 .threat_category_preview table td {
-  min-width: 1.3em;
+  min-width: 1.5em;
   padding: 0 0.2em;
   border-width: 0px 0px 0px 0px;
 }
@@ -497,6 +497,7 @@ td.country a {
   color: #222;
   border-left: 2px dotted #888;
   padding: 0 0.5em;
+  min-width: 9.5em;
 }
 
 .threat_category_detail table {
@@ -1284,13 +1285,69 @@ ul.data-list li ul li {
     flex-direction: row;
     width: 813px;
     justify-content: space-between;
+    padding-bottom: 25px;
   }
 
   #threat_category p
   {
     font-weight: bold;
+    padding-right: 10px;
+  }
+
+  #tc_role_label
+  {
+    display: block;
+    padding-bottom: 3px;
+  }
+
+  #tc_role_wrap,
+  #tc_role_wrap button,
+  #tc_role_wrap .ms-options,
+  #tc_role_wrap .ms-options-wrap span
+  {
+    width: 115px;
+  }
+
+  #tc_category_label
+  {
+    display: block;
+    padding-bottom: 3px;
+  }
+
+  #tc_category_wrap,
+  #tc_category_wrap button,
+  #tc_category_wrap .ms-options,
+  #tc_category_wrap .ms-options-wrap span
+  {
+    width: 170px;
+  }
+
+  #tc_subcategory_label
+  {
+    display: block;
+    padding-bottom: 3px;
+  }
+
+  #tc_subcategory_wrap
+  {
+    width: 222px;
+  }
+
+  #tc_subcategory_wrap select
+  {
+    width: 115px;
+  }
+
+  #tc_confidence_label
+  {
+    display: block;
+    padding-bottom: 3px;
   }
 
+  #tc_confidence_wrap
+  {
+    width: 90px;
+  }
 
   #searchForm
   {
@@ -1357,7 +1414,7 @@ ul.data-list li ul li {
 
   #narrow .ms-options-wrap span
   {
-    width: 500px;
+    width: 180px;
   }
 
   .center-row
diff --git a/NERDweb/templates/ips.html b/NERDweb/templates/ips.html
index bcec2e10..fb9c7b06 100644
--- a/NERDweb/templates/ips.html
+++ b/NERDweb/templates/ips.html
@@ -155,27 +155,58 @@ <h1>Search IP addresses by ...</h1>
     </div>
       {% endif %}
 
-      <div class="row narrow" id="narrow">
-        <div id="sorting">
-          <div>
-            <p>Threat category</p>
-          </div>
-          <div class="opt">
-            <span class="lab">
-              Role
-              <span id="tc_role_op_button" class="op_button"><div class="or selected"><span>OR</span></div> <div class="and"><span>AND</span></div></span></span>
-             {{ form.tc_role() }}{{ form.tc_role_op() }}
-          </div>
-          <div class="opt">
-            <span class="lab">
-              Category
-              <span id="tc_category_op_button" class="op_button"><div class="or selected"><span>OR</span></div> <div class="and"><span>AND</span></div></span></span>
-             {{ form.tc_category() }}{{ form.tc_category_op() }}
-          </div>
-          <div class="opt">
-            <span class="lab">Subcategory</span>
-            {{ formfield(form.tc_subcategory, size=10) }}
-          </div>
+      <div id="threat_category">
+        <div>
+          <p>Threat category</p>
+        </div>
+        <div class="opt" id="tc_role_wrap">
+          <span id="tc_role_label">
+            Role
+            <div class="tooltip">
+              <i class="fa fa-question-circle-o" aria-hidden="true"></i>
+              <div class="tooltip-text">
+                Select IP addresses with threat category records matching the selected role.
+              </div>
+            </div>
+          </span>
+           {{ form.tc_role() }}
+        </div>
+        <div class="opt" id="tc_category_wrap">
+          <span id="tc_category_label">
+            Category
+            <div class="tooltip">
+              <i class="fa fa-question-circle-o" aria-hidden="true"></i>
+              <div class="tooltip-text">
+                Select IP addresses with threat category records matching the selected category.
+              </div>
+            </div>
+            <span id="tc_category_op_button" class="op_button"><div class="or selected"><span>OR</span></div> <div class="and"><span>AND</span></div></span>
+          </span>
+           {{ form.tc_category() }}{{ form.tc_category_op() }}
+        </div>
+        <div class="opt" id="tc_subcategory_wrap">
+          <span id="tc_subcategory_label">
+            Subcategory
+            <div class="tooltip">
+              <i class="fa fa-question-circle-o" aria-hidden="true"></i>
+              <div class="tooltip-text">
+                Select IP addresses with threat category records matching the selected subcategory.
+              </div>
+            </div>
+          </span>
+          {{ form.tc_subcategory_key() }} = {{ formfield(form.tc_subcategory_value, size=8) }}
+        </div>
+        <div class="opt" id="tc_confidence_wrap">
+          <span id="tc_confidence_label">
+            Confidence
+            <div class="tooltip">
+              <i class="fa fa-question-circle-o" aria-hidden="true"></i>
+              <div class="tooltip-text">
+                Minimum category confidence.
+              </div>
+            </div>
+          </span>
+          {{ formfield(form.tc_confidence, size=8) }}
         </div>
       </div>
 

From 08d75400494bca02dd6b91d92768837bc04a1b9d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mat=C4=9Bj=20Barnat?= <barnat@cesnet.cz>
Date: Thu, 4 Apr 2024 08:54:39 +0200
Subject: [PATCH 11/21] threat_categorization: do not store empty subcategory
 arrays in threat category records

---
 common/threat_categorization.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/common/threat_categorization.py b/common/threat_categorization.py
index 32919696..01d85167 100644
--- a/common/threat_categorization.py
+++ b/common/threat_categorization.py
@@ -173,6 +173,10 @@ def eval_trigger(trigger, event, category_params, config, logger):
                 if match_str(family_data["common_name"], text):
                     subcategories["malware_family"].append(family_id.lower())
             subcategories["malware_family"] = list(set(subcategories["malware_family"]))
+
+    for key in list(subcategories):
+        if not subcategories[key]:
+            subcategories.pop(key)
     return result, subcategories
 
 

From cf515345e9b8a718f3e10eb6cd9ffed6c96d7294 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mat=C4=9Bj=20Barnat?= <barnat@cesnet.cz>
Date: Thu, 4 Apr 2024 10:52:52 +0200
Subject: [PATCH 12/21] blacklists: added Crowdsec blacklist

The blacklist is created by exporting CAPI decisions from the Crowdsec DB at logs.liberouter.org
---
 etc/primary_blacklists.yml | 10 ++++++++++
 install/cron/nerd          |  2 ++
 2 files changed, 12 insertions(+)

diff --git a/etc/primary_blacklists.yml b/etc/primary_blacklists.yml
index 8197b4da..732319be 100644
--- a/etc/primary_blacklists.yml
+++ b/etc/primary_blacklists.yml
@@ -544,3 +544,13 @@ iplists:
 #  time:
 #    hour: 1,9,17
 #    minute: 45
+#
+#- id: crowdsec
+#  name: Crowdsec
+#  descr: Crowdsec community blacklist of malicious IPs.
+#  provider_link: https://docs.crowdsec.net/
+#  url: file:///data/blacklists/crowdsec.csv
+#  regex: "^(\\A),(.*)"
+#  time:
+#    hour: 1,9,17
+#    minute: 45
diff --git a/install/cron/nerd b/install/cron/nerd
index e3910a71..36d43e1d 100644
--- a/install/cron/nerd
+++ b/install/cron/nerd
@@ -23,6 +23,8 @@
 40 01,09,17 * * * nerd rsync -azq rsync-mirrors.uceprotect.net::RBLDNSD-ALL/dnsbl-1.uceprotect.net /data/blacklists/uceprotect-level1
 # rsync PSBL blacklist 3 times a day
 41 01,09,17 * * * nerd rsync -zq psbl-mirror.surriel.com::psbl/psbl.txt /data/blacklists/psbl.txt
+# rsync Crowdsec blacklist 3 times a day
+42 01,09,17 * * * nerd rsync -zq logs.liberouter.org::crowdsec/crowdsec_blacklist.csv /data/blacklists/crowdsec.csv
 
 # Check Apache log for 5xx errors every hour. If grep produces output, it's sent to the email contact.
 # Run at the end of every hour and simply filter all log lines with the current hour (not perfect, but simple)

From 1561a3cb9b77612ba10c2d8059a2a4db4bddd370 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mat=C4=9Bj=20Barnat?= <barnat@cesnet.cz>
Date: Mon, 15 Apr 2024 14:19:48 +0200
Subject: [PATCH 13/21] web: Added downloadable files with IP categories and
 category blacklists

---
 NERDweb/nerd_main.py                    |  8 +++-
 NERDweb/static/style.css                |  9 ++++
 NERDweb/templates/data.html             | 30 +++++++++++-
 install/cron/nerd                       |  4 ++
 scripts/generate_category_blocklists.sh | 51 +++++++++++++++++++++
 scripts/generate_ip_category_list.sh    | 61 +++++++++++++++++++++++++
 6 files changed, 161 insertions(+), 2 deletions(-)
 create mode 100644 scripts/generate_category_blocklists.sh
 create mode 100644 scripts/generate_ip_category_list.sh

diff --git a/NERDweb/nerd_main.py b/NERDweb/nerd_main.py
index 820730c7..39af5cfe 100644
--- a/NERDweb/nerd_main.py
+++ b/NERDweb/nerd_main.py
@@ -1721,8 +1721,14 @@ def map_index():
     "ip_rep.csv",
     "bad_ips.txt",
     "bad_ips_med_conf.txt",
+    "ip_category.csv",
+    "ip_category_table.csv",
 ]
 
+# Add category blacklist files (created by /scripts/generate_category_blocklists.sh)
+BL_FILES = [f"bl_{cat}.txt" for cat in threat_categorization_config if cat != "unknown"]
+FILES += BL_FILES
+
 @app.route('/data/')
 def data_index():
     log_ep.log('/data')
@@ -1736,7 +1742,7 @@ def data_index():
             file_sizes[f] = os.stat(os.path.join(DATA_DIR, f)).st_size
         except OSError:
             file_sizes[f] = None
-    return render_template("data.html", title=title, file_sizes=file_sizes)
+    return render_template("data.html", title=title, file_sizes=file_sizes, bl_files=BL_FILES)
 
 @app.route('/data/<filename>')
 def data_file(filename):
diff --git a/NERDweb/static/style.css b/NERDweb/static/style.css
index 7f18ee0c..8f2c98be 100644
--- a/NERDweb/static/style.css
+++ b/NERDweb/static/style.css
@@ -14,6 +14,11 @@ h1 {
   margin-bottom: 0.25em;
 }
 
+h2 {
+  font-size: larger;
+  margin-top: 1.5em;
+}
+
 hr {
   border: 0;
   border-top: 2px solid #0061a2;
@@ -1041,6 +1046,10 @@ ul.data-list li ul li {
   color: #777;
 }
 
+.data-list+p {
+  margin-top: 2em;
+}
+
 
 /***********************************************/
 /* NERD status block */
diff --git a/NERDweb/templates/data.html b/NERDweb/templates/data.html
index a80035ca..9cff3a5c 100644
--- a/NERDweb/templates/data.html
+++ b/NERDweb/templates/data.html
@@ -3,6 +3,7 @@
 
 <h1>Downloadable data</h1>
 
+<h2>Reputation score</h2>
 <ul class="data-list">
 <li>List of all IPs in NERD database with their reputation scores:
 {% if file_sizes['ip_rep.csv'] is number %}<a href="{{ url_for("data_file", filename="ip_rep.csv") }}">ip_rep.csv</a> ({{file_sizes['ip_rep.csv']|filesizeformat}}){% else %}<span class="error">ERROR File not found</span>{% endif %}
@@ -18,6 +19,33 @@ <h1>Downloadable data</h1>
 </li>
 </ul>
 
-All files are updated once per hour.
+<h2>Threat categorization</h2>
+<ul class="data-list">
+  <li>
+    List of all IPs in NERD with their categories:
+    <ul>
+      <li>
+        Line format:
+        {% if file_sizes['ip_category.csv'] is number %}<a href="{{ url_for("data_file", filename="ip_category.csv") }}">ip_category.csv</a> ({{file_sizes['ip_category.csv']|filesizeformat}}){% else %}<span class="error">ERROR File not found</span>{% endif %}
+      </li>
+      <li>
+        Table format:
+        {% if file_sizes['ip_category_table.csv'] is number %}<a href="{{ url_for("data_file", filename="ip_category_table.csv") }}">ip_category_table.csv</a> ({{file_sizes['ip_category_table.csv']|filesizeformat}}){% else %}<span class="error">ERROR File not found</span>{% endif %}
+      </li>
+    </ul>
+  </li>
+  <li>
+    Collection of blacklists for each category (IPs with confidence greater than 0.5):
+    <ul>
+      {% for bl_file in bl_files %}
+      <li>
+        {% if file_sizes[bl_file] is number %}<a href="{{ url_for("data_file", filename=bl_file) }}">{{bl_file}}</a> ({{file_sizes[bl_file]|filesizeformat}}){% else %}<span class="error">ERROR File not found</span>{% endif %}
+      </li>
+      {% endfor %}
+    </ul>
+  </li>
+</ul>
+
+<p>All files are updated once per hour.</p>
 
 {% endblock %}
diff --git a/install/cron/nerd b/install/cron/nerd
index 36d43e1d..59bafb39 100644
--- a/install/cron/nerd
+++ b/install/cron/nerd
@@ -10,6 +10,10 @@
 00 * * * * nerd /nerd/scripts/generate_blocklist.sh 0.5 | sort -n > /data/web_data/bad_ips.txt
 00 * * * * nerd /nerd/scripts/generate_blocklist.sh 0.2 | sort -n > /data/web_data/bad_ips_med_conf.txt
 
+# Generate list of IPs and threat categories every hour
+00 * * * * nerd /nerd/scripts/generate_ip_category_list.sh /data/web_data
+00 * * * * nerd /nerd/scripts/generate_category_blocklists.sh 0.5 /data/web_data
+
 # Remove old IDEA messages from PostgreSQL every day at 03:00
 # (enable if local PSQL is used to store alerts from Warden)
 #0 03 * * * nerd /nerd/scripts/nerd_clean_eventdb.sh > /dev/null
diff --git a/scripts/generate_category_blocklists.sh b/scripts/generate_category_blocklists.sh
new file mode 100644
index 00000000..f8be84a7
--- /dev/null
+++ b/scripts/generate_category_blocklists.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+# Generate blocklists for each threat category (IPs with confidence higher than a given threshold)
+# Intended use is to generate a blocklist/TI feed for download.
+#
+# Takes two parameters - confidence threshold (default = 0.5) and output directory
+# (default = /data/web_data).
+# Writes one plain-text file per category (bl_<category>.txt): a comment header, then one IP per line
+
+if [[ -z "$1" ]]; then
+  thr=0.5 # no/empty parameter, use default
+elif [[ "$1" =~ ^0[.][0-9]+$ ]]; then
+  thr="$1"
+else
+  # Invalid threshold: abort before touching any previously generated file
+  echo "ERROR: Threshold must be a number between 0 and 1" >&2 # Error message to stderr
+  exit 1
+fi
+
+out_dir="${2%/}"
+if [[ -z "$out_dir" ]]; then
+  out_dir=/data/web_data # no/empty parameter, use default
+elif [ ! -d "$out_dir" ]; then
+  # Create the output directory if it doesn't exist
+  mkdir -p "$out_dir"
+fi
+
+# List of category IDs
+# TODO load category ids dynamically from categorization config
+declare -a categories=(
+  "botnet_drone"
+  "bruteforce"
+  "cc"
+  "ddos"
+  "ddos-amplifier"
+  "exploit"
+  "malware_distribution"
+  "phishing_site"
+  "scan"
+  "spam"
+)
+
+for category in "${categories[@]}"; do
+  # Header goes into the .tmp file so that it survives the final (atomic) rename
+  echo "# Generated at $(date -u '+%Y-%m-%d %H:%M UTC')" > "$out_dir/bl_$category.txt.tmp"
+  mongosh nerd --quiet --eval '
+    function int2ip (ipInt) { return ( (ipInt>>>24) + "." + (ipInt>>16 & 255) + "." + (ipInt>>8 & 255) + "." + (ipInt & 255) ); }
+    // No server-side sort: the output is re-ordered by "sort -n" below anyway
+    db.ip.find({"_threat_category_summary": {$elemMatch: {"c": "'"$category"'", "conf": {$gt: '"$thr"'}}}, "tags.whitelist": {$exists: false}}, {_id: 1}).forEach( function(rec) { print(int2ip(rec._id)); } );
+  ' | grep -v "^$" | sort -n >> "$out_dir/bl_$category.txt.tmp"
+  mv "$out_dir/bl_$category.txt"{.tmp,}
+done
diff --git a/scripts/generate_ip_category_list.sh b/scripts/generate_ip_category_list.sh
new file mode 100644
index 00000000..bf8fccf1
--- /dev/null
+++ b/scripts/generate_ip_category_list.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+# Generate list of all IPs in NERD's database with their categories
+# Takes one parameter - output directory
+
+out_dir="${1%/}"
+if [[ -z "$out_dir" ]]; then
+  out_dir=/data/web_data # no/empty parameter, use default
+elif [ ! -d "$out_dir" ]; then
+  # Create the output directory if it doesn't exist
+  mkdir -p "$out_dir"
+fi
+
+# Line format
+out_file="$out_dir/ip_category.csv"
+echo "# Generated at $(date -u '+%Y-%m-%d %H:%M UTC')" > "$out_file.tmp"
+echo "# IP,Category,Confidence" >> "$out_file.tmp"
+mongosh nerd --quiet --eval '
+function int2ip (ipInt) {
+  return ( (ipInt>>>24) + "." + (ipInt>>16 & 255) + "." + (ipInt>>8 & 255) + "." + (ipInt & 255) );
+}
+db.ip.aggregate([
+  { $unwind: "$_threat_category_summary" },
+  { $set: { category: "$_threat_category_summary.c", confidence: { $toString: "$_threat_category_summary.conf" } } },
+  { $project: { _id: 1, category: 1, confidence: 1 } }
+]).forEach(function(rec) { print(int2ip(rec._id) + "," + rec.category + "," + rec.confidence); })' | grep -v "^$" | sort -n >> "$out_file.tmp"
+mv "$out_file"{.tmp,}
+
+# Table format
+# TODO load category ids dynamically from categorization config
+out_file="$out_dir/ip_category_table.csv"
+echo "# Generated at $(date -u '+%Y-%m-%d %H:%M UTC')" > "$out_file.tmp"
+echo "# IP,botnet_drone,bruteforce,cc,ddos,ddos-amplifier,exploit,malware_distribution,phishing_site,scan,spam" >> "$out_file.tmp"
+mongosh nerd --quiet --eval '
+function int2ip (ipInt) {
+  return ( (ipInt>>>24) + "." + (ipInt>>16 & 255) + "." + (ipInt>>8 & 255) + "." + (ipInt & 255) );
+}
+db.ip.aggregate([
+  { $project: { _id: 1, _threat_category_summary: 1 } },
+  { $set: { categories: { $arrayToObject: { $map: { input: "$_threat_category_summary", as: "cat", in: { k: "$$cat.c", v: { $toString: "$$cat.conf" } } } } } } },
+  { $replaceWith: { $mergeObjects: ["$$ROOT", "$categories"] } },
+  { $project: {
+    "_id": 1,
+    "botnet_drone": { $ifNull: ["$botnet_drone", "0"] },
+    "bruteforce": { $ifNull: ["$bruteforce", "0"] },
+    "cc": { $ifNull: ["$cc", "0"] },
+    "ddos": { $ifNull: ["$ddos", "0"] },
+    "ddos-amplifier": { $ifNull: ["$ddos-amplifier", "0"] },
+    "exploit": { $ifNull: ["$exploit", "0"] },
+    "malware_distribution": { $ifNull: ["$malware_distribution", "0"] },
+    "phishing_site": { $ifNull: ["$phishing_site", "0"] },
+    "scan": { $ifNull: ["$scan", "0"] },
+    "spam": { $ifNull: ["$spam", "0"] }
+  }}
+]).forEach(function(rec) {
+  var categories = Object.keys(rec).filter(key => key !== "_id");
+  print(
+    int2ip(rec._id) + "," +
+    categories.map(key => rec[key]).join(",")
+  );
+})' | grep -v "^$" | sort -n >> "$out_file.tmp"
+mv "$out_file"{.tmp,}

From ff49a6562ee2308dbbe297ba300b757f979da8df Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mat=C4=9Bj=20Barnat?= <barnat@cesnet.cz>
Date: Thu, 25 Apr 2024 08:23:12 +0200
Subject: [PATCH 14/21] threat_categorization: improved classification rules

---
 common/threat_categorization.py |   5 +-
 etc/threat_categorization.yml   | 142 ++++++++++++++++----------------
 2 files changed, 75 insertions(+), 72 deletions(-)

diff --git a/common/threat_categorization.py b/common/threat_categorization.py
index 01d85167..2c5972e6 100644
--- a/common/threat_categorization.py
+++ b/common/threat_categorization.py
@@ -1,5 +1,6 @@
 import yaml
 import ast
+import re
 from datetime import datetime
 
 from .utils import parse_rfc_time
@@ -41,8 +42,10 @@ def init_warden_receiver(self, event, source):
         detect_time = parse_rfc_time(event["DetectTime"])
         self.date = detect_time.strftime("%Y-%m-%d")
         self.categories = event.get('Category', [])
-        self.ip_info = ";".join(source.get('Type', []))
         self.description = event.get("Description", "")
+        self.note = event.get("Note", "")
+        self.ip_info = source.get('Note', "")
+        self.source_types = source.get('Type', [])
         target_ports = []
         protocols = source.get('Proto', [])
         for target in event.get('Target', []):
diff --git a/etc/threat_categorization.yml b/etc/threat_categorization.yml
index 29455c1a..16d66268 100644
--- a/etc/threat_categorization.yml
+++ b/etc/threat_categorization.yml
@@ -55,16 +55,20 @@ threat_categorization:
       - port
     triggers:
       general: |-
-        match_str('scan', event.ip_info)
+        bool(re.findall(r'(?i)\bscan(ning|ners?)?\b', event.ip_info + event.description))
       warden_receiver: |-
-        any([match_str('Recon', cat) for cat in event.categories])
+        'Recon.Scanning' in event.categories
       otx_receiver: |-
-        match_str('scan', event.indicator_role)
-        event.description == 'Webscanners 2018-02-09 thru current day'
+        bool(re.findall(r'(?i)\bscan(ning|ners?)?\b', event.indicator_role))
       misp_receiver: |-
-        'CERT-XLM:information-gathering="scanner"' in event.tags
-        'ecsirt:information-gathering="scanner"' in event.tags
-        'circl:incident-classification="scan"' in event.tags
+        any([bool(re.findall(r'(?i)\bscan(ning|ners?)?\b', tag)) for tag in event.tags])
+      blacklists: |-
+        event.ip_info == 'crowdsecurity/iptables-scan-multi_ports'
+        event.ip_info == 'crowdsecurity/http-crawl-non_statics'
+        event.ip_info == 'crowdsecurity/http-path-traversal-probing'
+        event.ip_info == 'crowdsecurity/http-admin-interface-probing'
+        event.ip_info == 'crowdsecurity/http-probing'
+        event.ip_info == 'crowdsecurity/http-sensitive-files'
 
   bruteforce:
     role: src
@@ -75,26 +79,25 @@ threat_categorization:
       - port
     triggers:
       general: |-
-        match_str('SSH login', (event.ip_info + event.description)) -> {'protocol': ['ssh']}
-        match_str('SSH intrusion', (event.ip_info + event.description)) -> {'protocol': ['ssh']}
-        match_str('SSH honeypot', (event.ip_info + event.description)) -> {'protocol': ['ssh']}
-        match_str('RDP honeypot', (event.ip_info + event.description)) -> {'protocol': ['rdp']}
-        match_str('Telnet login', (event.ip_info + event.description)) -> {'protocol': ['telnet']}
-        match_str('Telnet honeypot', (event.ip_info + event.description)) -> {'protocol': ['telnet']}
-        match_str('bruteforce', event.ip_info)
+        bool(re.findall(r'(?i)ssh.*(brute[\s_-]?force|login|intrusion|honeypot)', event.ip_info + event.description)) -> {'protocol': ['ssh']}
+        bool(re.findall(r'(?i)rdp.*(brute[\s_-]?force|login|intrusion|honeypot)', event.ip_info + event.description)) -> {'protocol': ['rdp']}
+        bool(re.findall(r'(?i)telnet.*(brute[\s_-]?force|login|intrusion|honeypot)', event.ip_info + event.description)) -> {'protocol': ['telnet']}
+        bool(re.findall(r'(?i)vnc.*(brute[\s_-]?force|login|intrusion|honeypot)', event.ip_info + event.description)) -> {'protocol': ['vnc']}
+        bool(re.findall(r'(?i)redis.*(brute[\s_-]?force|login|intrusion|honeypot)', event.ip_info + event.description)) -> {'protocol': ['redis']}
+        bool(re.findall(r'(?i)postgresql.*(brute[\s_-]?force|login|intrusion|honeypot)', event.ip_info + event.description)) -> {'protocol': ['postgresql']}
       warden_receiver: |-
         'Attempt.Login' in event.categories
         'Intrusion.UserCompromise' in event.categories
         'Intrusion.AdminCompromise' in event.categories
       otx_receiver: |-
-        match_str('bruteforce', event.indicator_role)
-        'VNC honeypot logs' in event.description
-        'Redis honeypot logs' in event.description
-        'PostgresQL honeypot logs' in event.description
+        bool(re.findall(r'(?i)brute[\s_-]?force', event.indicator_role))
       misp_receiver: |-
-        'CERT-XLM:intrusion-attempts="login-attempts"' in event.tags
-        'ecsirt:intrusion-attempts="brute-force"' in event.tags
+        any([bool(re.findall(r'(?i)login.*attempt', tag)) for tag in event.tags])
+        any([bool(re.findall(r'(?i)brute[\s_-]?force', tag)) for tag in event.tags])
       blacklists: |-
+        bool(re.findall(r'(?i)brute[\s_-]?force.*ssh', event.ip_info)) -> {'protocol': ['ssh']}
+        bool(re.findall(r'(?i)brute[\s_-]?force.*ftp', event.ip_info)) -> {'protocol': ['ftp']}
+        bool(re.findall(r'(?i)brute[\s_-]?force.*sip', event.ip_info)) -> {'protocol': ['sip']}
         event.description == 'blocklist_de-ssh' -> {'protocol': ['ssh']}
         event.description == 'charles_the_haleys_ssh_dico_ips' -> {'protocol': ['ssh']}
         event.description == 'charles_the_haleys_smtp_dico_ips' -> {'protocol': ['smtp']}
@@ -103,9 +106,8 @@ threat_categorization:
         event.description == 'dataplane_org_telnet_login' -> {'protocol': ['telnet']}
         event.description == 'bruteforceblocker'
         event.description == 'blocklist_de-bruteforcelogin'
-        match_str('Brute force passwords using SSH', event.ip_info) -> {'protocol': ['ssh']}
-        match_str('Brute force passwords using FTP', event.ip_info) -> {'protocol': ['ftp']}
-        match_str('Brute force passwords to SIP', event.ip_info) -> {'protocol': ['sip']}
+        event.ip_info == 'crowdsecurity/http-generic-bf' -> {'protocol': ['http']}
+        event.ip_info.startswith('crowdsecurity/ssh-') -> {'protocol': ['ssh']}
 
   ddos:
     role: src
@@ -113,19 +115,14 @@ threat_categorization:
     label: DDoS
     triggers:
       warden_receiver: |-
-        'DoS anomalies' in event.description
         'Availability.DoS' in event.categories
         'Availability.DDoS' in event.categories
       misp_receiver: |-
-        'DDoS' in event.tags
-        'CERT-XLM:availability="dos"' in event.tags
-        'CERT-XLM:availability="ddos"' in event.tags
-        'ecsirt:availability="dos"' in event.tags
-        'ecsirt:availability="ddos"' in event.tags
-        'circl:incident-classification="denial-of-service"' in event.tags
+        any([bool(re.findall(r'(?i)d?dos', tag)) for tag in event.tags])
+        any([bool(re.findall(r'(?i)\bdenial[\s_-]of[\s_-]service\b', tag)) for tag in event.tags])
       blacklists: |-
-        match_str('HTTP flood', event.ip_info) -> {'protocol': ['http']}
-        match_str('DDoS', event.ip_info)
+        bool(re.findall(r'(?i)http[\s_-]flood', event.ip_info)) -> {'protocol': ['http']}
+        bool(re.findall(r'(?i)d?dos', event.ip_info))
 
   ddos-amplifier:
     role: dst
@@ -135,10 +132,11 @@ threat_categorization:
       - protocol
     triggers:
       general: |-
-        match_str('Open DNS', (event.ip_info + event.description)) -> {'protocol': ['dns']}
-        match_str('Open Memcached', (event.ip_info + event.description)) -> {'protocol': ['memcached']}
-        match_str('Abusable NTP', (event.ip_info + event.description)) -> {'protocol': ['ntp']}
+        bool(re.findall(r'(?i)(Open|Abusable)[\s_-]DNS', event.ip_info + event.description)) -> {'protocol': ['dns']}
+        bool(re.findall(r'(?i)(Open|Abusable)[\s_-]Memcached', event.ip_info + event.description)) -> {'protocol': ['memcached']}
+        bool(re.findall(r'(?i)(Open|Abusable)[\s_-]NTP', event.ip_info + event.description)) -> {'protocol': ['ntp']}
       warden_receiver: |-
+        ('Availability.DoS' in event.categories or 'Availability.DDoS' in event.categories) and bool(re.findall(r'(?i)dns.*amplification', event.description + event.note)) -> {'protocol': ['dns']}
         'Vulnerable.Config' in event.categories and 'dns' in event.protocols -> {'protocol': ['dns']}
         'Vulnerable.Config' in event.categories and 'ntp' in event.protocols -> {'protocol': ['ntp']}
         'Vulnerable.Config' in event.categories and 'memcached' in event.protocols -> {'protocol': ['memcached']}
@@ -152,18 +150,18 @@ threat_categorization:
     label: Spam
     triggers:
       general: |-
-        match_str('spam', event.ip_info)
+        bool(re.findall(r'(?i)\bspam(ming)?\b', event.ip_info + event.description))
       warden_receiver: |-
         'Abusive.Spam' in event.categories
+        any([type == 'Spam' for type in event.source_types])
       misp_receiver: |-
-        'CERT-XLM:abusive-content="spam"' in event.tags
-        'ecsirt:abusive-content="spam"' in event.tags
-        'circl:incident-classification="spam"' in event.tags
+        any([bool(re.findall(r'(?i)spam', tag)) for tag in event.tags])
       blacklists: |-
         event.description == 'sblam_ips'
         event.description == 'psbl'
         event.description == 'spamhaus_edrop'
-        match_str('Send spam', event.ip_info)
+        bool(re.findall(r'(?i)send.*spam', event.ip_info))
+        event.ip_info == 'crowdsecurity/postfix-spam'
 
   malware_distribution:
     role: dst
@@ -173,21 +171,15 @@ threat_categorization:
       - malware_family
     triggers:
       general: |-
-        match_str('malware', event.ip_info)
-        match_str('trojan', event.ip_info)
-        match_str('ransomware', event.ip_info)
-        match_str('payload delivery', event.ip_info)
+        bool(re.findall(r'(?i)malware.*host', event.ip_info + event.description))
+        bool(re.findall(r'(?i)malware.*download', event.ip_info + event.description))
       warden_receiver: |-
-        any([match_str('Malware', cat) for cat in event.categories])
+        any([type == 'Malware' for type in event.source_types])
       otx_receiver: |-
-        match_str('malware', event.indicator_role)
-        match_str('trojan', event.indicator_role)
+        bool(re.findall(r'(?i)malware.*host', event.indicator_role))
+        bool(re.findall(r'(?i)malware.*download', event.indicator_role))
       misp_receiver: |-
-        any([match_str('malware', tag) for tag in event.tags])
-        any([match_str('ransomware', tag) for tag in event.tags])
-        any([match_str('trojan', tag) for tag in event.tags])
-        'circl:incident-classification="malware"' in event.tags
-        'ecsirt:malicious-code="malware"' in event.tags
+        any([bool(re.findall(r'(?i)(malware|trojan|ransomware)', tag)) for tag in event.tags]) and event.ip_role == "dst"
       blacklists: |-
         event.description == 'urlhouse_ips'
 
@@ -199,19 +191,22 @@ threat_categorization:
       - malware_family
     triggers:
       general: |-
-        match_str('command and control', event.ip_info)
-        match_str('botnet cc', event.ip_info)
-        match_str('c2 server', event.ip_info)
+        bool(re.findall(r'(?i)command.*control', event.ip_info + event.description))
+        bool(re.findall(r'(?i)botnet[\s_-]cc', event.ip_info + event.description))
+        bool(re.findall(r'(?i)c2[\s_-]server', event.ip_info + event.description))
+        bool(re.findall(r'(?i)c&?c[\s_-]server', event.ip_info + event.description))
       warden_receiver: |-
-        'CC' in event.ip_info
+        any([type == 'CC' for type in event.source_types])
       otx_receiver: |-
-        match_str('command and control', event.indicator_role)
+        bool(re.findall(r'(?i)command.*control', event.indicator_role))
+        bool(re.findall(r'(?i)c2', event.indicator_role))
+        bool(re.findall(r'(?i)c&?c', event.indicator_role))
       misp_receiver: |-
-        'C2' in event.tags
-        'kill-chain:Command and Control' in event.tags
-        'ecsirt:malicious-code="c&c"' in event.tags
+        any([bool(re.findall(r'(?i)command.*control', tag)) for tag in event.tags])
+        any([bool(re.findall(r'(?i)c2', tag)) for tag in event.tags])
+        any([bool(re.findall(r'(?i)c&c', tag)) for tag in event.tags])
       blacklists: |-
-        event.description == 'feodo'
+        event.description == 'feodo' -> {'malware_family': ['win.feodo']}
         event.description == 'bambenek_c2'
 
   botnet_drone:
@@ -221,12 +216,14 @@ threat_categorization:
     subcategories:
       - malware_family
     triggers:
+      general: |-
+        bool(re.findall(r'(?i)botnet.*drone', event.ip_info + event.description))
+        bool(re.findall(r'(?i)botnet.*member', event.ip_info + event.description))
       warden_receiver: |-
         'Intrusion.Botnet' in event.categories
-        'Botnet' in event.ip_info
+        any([type == 'Botnet' for type in event.source_types])
       misp_receiver: |-
-        'CERT-XLM:intrusion="botnet-member"' in event.tags
-        'ecsirt:malicious-code="botnet-drone"' in event.tags
+        any([bool(re.findall(r'(?i)botnet', tag)) for tag in event.tags])
       blacklists: |-
         event.description == 'mirai_tracker_ips' -> {'malware_family': ['elf.mirai']}
 
@@ -236,11 +233,12 @@ threat_categorization:
     label: Phishing site
     triggers:
       general: |-
-        match_str('phishing', event.ip_info)
+        bool(re.findall(r'(?i)phishing.*site', event.ip_info + event.description))
       warden_receiver: |-
-        'Fraud.Phishing' in event.categories
+        any([type == 'Phishing' for type in event.source_types])
       misp_receiver: |-
-        any([match_str('phishing', tag) for tag in event.tags])
+        any([bool(re.findall(r'(?i)phishing.*site', tag)) for tag in event.tags])
+        any([bool(re.findall(r'(?i)phishing', tag)) for tag in event.tags]) and event.ip_role == "dst"
       blacklists: |-
         event.description == 'openphish'
 
@@ -252,14 +250,16 @@ threat_categorization:
       - protocol
     triggers:
       general: |-
-        match_str('exploit', event.ip_info)
+        bool(re.findall(r'(?i)attempt(ing)?.*exploit', event.ip_info + event.description))
       warden_receiver: |-
         'Attempt.Exploit' in event.categories
       otx_receiver: |-
         'Apache honeypot logs' in event.description -> {'protocol': ['http']}
-        match_str('exploit', event.indicator_role)
+        bool(re.findall(r'(?i)exploit', event.indicator_role))
       misp_receiver: |-
-        any([match_str('exploit', tag) for tag in event.tags])
+        any([bool(re.findall(r'(?i)exploit', tag)) for tag in event.tags])
         'CERT-XLM:intrusion-attempts="new-attack-signature"' in event.tags
         'circl:incident-classification="XSS"' in event.tags
-        'circl:incident-classification="sql-injection"' in event.tags
\ No newline at end of file
+        'circl:incident-classification="sql-injection"' in event.tags
+      blacklists: |-
+        bool(re.findall(r'(?i)CVE[-_]20', event.ip_info))
\ No newline at end of file

From 15081e4e49a66b1deb5806df2e6287c328c6ed40 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mat=C4=9Bj=20Barnat?= <barnat@cesnet.cz>
Date: Wed, 8 May 2024 15:13:16 +0200
Subject: [PATCH 15/21] threat_categorization.yml: minor changes in
 classification rules

---
 etc/threat_categorization.yml | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/etc/threat_categorization.yml b/etc/threat_categorization.yml
index 16d66268..c707a354 100644
--- a/etc/threat_categorization.yml
+++ b/etc/threat_categorization.yml
@@ -55,13 +55,13 @@ threat_categorization:
       - port
     triggers:
       general: |-
-        bool(re.findall(r'(?i)\b(scan(ning)|(ners?)?)\b', event.ip_info + event.description))
+        bool(re.findall(r'(?i)scanning|scanner|probing', event.ip_info + event.description))
       warden_receiver: |-
         'Recon.Scanning' in event.categories
       otx_receiver: |-
-        bool(re.findall(r'(?i)\b(scan(ning)|(ners?)?)\b', event.indicator_role))
+        bool(re.findall(r'(?i)scanning|scanner|probing', event.indicator_role))
       misp_receiver: |-
-        any([bool(re.findall(r'(?i)\b(scan(ning)|(ners?)?)\b', tag)) for tag in event.tags])
+        any([bool(re.findall(r'(?i)scanning|scanner|probing', tag)) for tag in event.tags])
       blacklists: |-
         event.ip_info == 'crowdsecurity/iptables-scan-multi_ports'
         event.ip_info == 'crowdsecurity/http-crawl-non_statics'
@@ -114,14 +114,19 @@ threat_categorization:
     description: The IP has been observed as a source of volumetric (D)DoS attacks.
     label: DDoS
     triggers:
+      general: |-
+        bool(re.findall(r'(?i)http.*flood', event.ip_info + event.description)) -> {'protocol': ['http']}
+        bool(re.findall(r'(?i)dns.*flood', event.ip_info + event.description)) -> {'protocol': ['dns']}
+        bool(re.findall(r'(?i)udp.*flood', event.ip_info + event.description)) -> {'protocol': ['udp']}
+        bool(re.findall(r'(?i)(ping|icmp).*flood', event.ip_info + event.description)) -> {'protocol': ['icmp']}
+        bool(re.findall(r'(?i)syn.*flood', event.ip_info + event.description)) -> {'protocol': ['tcp']}
       warden_receiver: |-
         'Availability.DoS' in event.categories
         'Availability.DDoS' in event.categories
       misp_receiver: |-
         any([bool(re.findall(r'(?i)d?dos', tag)) for tag in event.tags])
-        any([bool(re.findall(r'(?i)\bdenial[\s_-]of[\s_-]service\b', tag)) for tag in event.tags])
+        any([bool(re.findall(r'(?i)denial[\s_-]of[\s_-]service', tag)) for tag in event.tags])
       blacklists: |-
-        bool(re.findall(r'(?i)http[\s_-]flood', event.ip_info)) -> {'protocol': ['http']}
         bool(re.findall(r'(?i)d?dos', event.ip_info))
 
   ddos-amplifier:
@@ -150,7 +155,7 @@ threat_categorization:
     label: Spam
     triggers:
       general: |-
-        bool(re.findall(r'(?i)\bspam(ming)?\b', event.ip_info + event.description))
+        bool(re.findall(r'(?i)send.*spam', event.ip_info + event.description))
       warden_receiver: |-
         'Abusive.Spam' in event.categories
         any([type == 'Spam' for type in event.source_types])
@@ -250,7 +255,7 @@ threat_categorization:
       - protocol
     triggers:
       general: |-
-        bool(re.findall(r'(?i)attempt(ing)?.*exploit', event.ip_info + event.description))
+        bool(re.findall(r'(?i)attempt.*exploit', event.ip_info + event.description))
       warden_receiver: |-
         'Attempt.Exploit' in event.categories
       otx_receiver: |-
@@ -262,4 +267,7 @@ threat_categorization:
         'circl:incident-classification="XSS"' in event.tags
         'circl:incident-classification="sql-injection"' in event.tags
       blacklists: |-
-        bool(re.findall(r'(?i)CVE[-_]20', event.ip_info))
\ No newline at end of file
+        bool(re.findall(r'(?i)CVE[-_]20', event.ip_info))
+        event.ip_info == 'http-sqli-probing' -> {'protocol': ['http']}
+        event.ip_info == 'http-xss-probing' -> {'protocol': ['http']}
+        event.ip_info == 'http-backdoors-attempts' -> {'protocol': ['http']}
\ No newline at end of file

From 9e6cbca02dbd1d835e7593a610b0ccaead3f867c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mat=C4=9Bj=20Barnat?= <barnat@cesnet.cz>
Date: Thu, 16 May 2024 08:15:28 +0200
Subject: [PATCH 16/21] scripts: rewritten scripts for generating downloadable
 files with IP categories

---
 install/cron/nerd                       |   5 +-
 scripts/generate_category_blocklists.sh |  51 ----------
 scripts/generate_ip_category_files.py   | 119 ++++++++++++++++++++++++
 scripts/generate_ip_category_list.sh    |  61 ------------
 4 files changed, 121 insertions(+), 115 deletions(-)
 delete mode 100644 scripts/generate_category_blocklists.sh
 create mode 100644 scripts/generate_ip_category_files.py
 delete mode 100644 scripts/generate_ip_category_list.sh

diff --git a/install/cron/nerd b/install/cron/nerd
index 3302636c..9952e923 100644
--- a/install/cron/nerd
+++ b/install/cron/nerd
@@ -11,9 +11,8 @@
 00 * * * * nerd /nerd/scripts/generate_blocklist.sh 0.5 | sort -n > /data/web_data/bad_ips.txt.tmp && mv /data/web_data/bad_ips.txt{.tmp,}
 00 * * * * nerd /nerd/scripts/generate_blocklist.sh 0.2 | sort -n > /data/web_data/bad_ips_med_conf.txt.tmp && mv /data/web_data/bad_ips_med_conf.txt{.tmp,}
 
-# Generate list of IPs and threat categories every hour
-00 * * * * nerd /nerd/scripts/generate_ip_category_list.sh /data/web_data
-00 * * * * nerd /nerd/scripts/generate_category_blocklists.sh 0.5 /data/web_data
+# Generate lists of IPs and their categories every hour
+00 * * * * nerd python3 /nerd/scripts/generate_ip_category_files.py --threshold 0.5 --output /data/web_data
 
 # Remove old IDEA messages from PostgreSQL every day at 03:00
 # (enable if local PSQL is used to store alerts from Warden)
diff --git a/scripts/generate_category_blocklists.sh b/scripts/generate_category_blocklists.sh
deleted file mode 100644
index f8be84a7..00000000
--- a/scripts/generate_category_blocklists.sh
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/bin/bash
-# Generate blocklists for each threat category (IPs with confidence higher than a given threshold)
-# Intended use is to generate a blocklist/TI feed for download.
-#
-# Takes two parameters - confidence threshold (default = 0.5) and output directory
-#
-# Output a plain-text file with one IP per line (and a comment in the beginning)
-
-if [[ -z "$1" ]]; then
-  thr=0.5 # no/emtpy parameter, use default
-elif [[ "$1" =~ ^0[.][0-9]+$ ]]; then
-  thr="$1"
-else
-  echo "ERROR" # This will be content of the generated blocklist
-  echo "ERROR: Threshold must be a number between 0 and 1" >&2 # Error message to stderr
-  exit 1
-fi
-
-out_dir="${2%/}"
-if [[ -z "$out_dir" ]]; then
-  out_dir=/data/web-data # no/emtpy parameter, use default
-elif [ ! -d "$out_dir" ]; then
-  # Create the output directory if it doesn't exist
-  mkdir -p "$out_dir"
-fi
-
-# List of category IDs
-# TODO load category ids dynamically from categorization config
-declare -a categories=(
-  "botnet_drone"
-  "bruteforce"
-  "cc"
-  "ddos"
-  "ddos-amplifier"
-  "exploit"
-  "malware_distribution"
-  "phishing_site"
-  "scan"
-  "spam"
-)
-
-for category in "${categories[@]}"; do
-  echo "# Generated at $(date -u '+%Y-%m-%d %H:%M UTC')" > "$out_dir/bl_$category.txt"
-  mongosh nerd --quiet --eval '
-    function int2ip (ipInt) {
-      return ( (ipInt>>>24) + "." + (ipInt>>16 & 255) + "." + (ipInt>>8 & 255) + "." + (ipInt & 255) );
-  }
-  db.ip.find({"_threat_category_summary": {$elemMatch: {"c": "'$category'", "conf": {$gt: '$thr'}}}, "tags.whitelist": {$exists: false}}, {_id: 1}).sort({"_threat_category_summary.conf": -1}).forEach( function(rec) { print(int2ip(rec._id)); } );
-  ' | grep -v "^$" | sort -n >> "$out_dir/bl_$category.txt.tmp"
-  mv "$out_dir/bl_$category.txt"{.tmp,}
-done
diff --git a/scripts/generate_ip_category_files.py b/scripts/generate_ip_category_files.py
new file mode 100644
index 00000000..57e39361
--- /dev/null
+++ b/scripts/generate_ip_category_files.py
@@ -0,0 +1,119 @@
+#!/usr/bin/env python3
+"""
+Generate list of all IPs in NERD's database with their categories
+Parameters - path to configuration file
+           - path to output directory
+           - blacklist confidence threshold
+"""
+import os
+import sys
+import subprocess
+import argparse
+
+# Add to path the "one directory above the current file location" to find modules from "common"
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')))
+
+from common.config import read_config
+
+
+# parse arguments
+parser = argparse.ArgumentParser(
+    prog="generate_ip_category_files.py",
+    description="Generate list of all IPs in NERD's database with their categories"
+)
+parser.add_argument("-c", '--config', dest='cfg_file', default="/etc/nerd/threat_categorization.yml",
+                    help="Path to configuration file (default: /etc/nerd/threat_categorization.yml)")
+parser.add_argument("-o", '--output', dest='out_dir', default="/data/web_data/",
+                    help="Path to output directory (default: /data/web_data/)")
+parser.add_argument("-t", '--threshold', dest='conf_thr', default="0.5",
+                    help="Blacklist confidence threshold (default: 0.5)")
+parser.add_argument("-v", '--verbose', dest="verbose", action="store_true", help="Verbose mode")
+args = parser.parse_args()
+
+# read categorization config
+config = read_config(args.cfg_file)
+categories = [cat for cat in config.get('threat_categorization')]
+categories.remove("unknown")
+
+# bash script used to execute the DB query
+script = """
+    echo \"# generated at $(date -u '+%Y-%m-%d %H:%M UTC')\" > {out_file}.tmp &&
+    echo \"# {header}\" >> {out_file}.tmp &&
+    mongosh nerd --quiet --eval '{query}' | grep -v \"^$\" | sort -n >> {out_file}.tmp &&
+    mv {out_file}{{.tmp,}}
+"""
+
+def fstr(template):
+    return eval(f"f'''{template}'''")
+
+########################################################################################################################
+
+# full list - line format (ip,category,confidence)
+if args.verbose:
+    print("Generating full IP list (line format)")
+
+out_file = f"{args.out_dir}/ip_category.csv"
+header = "ip,category,confidence"
+query = '''
+function int2ip (ipInt) {
+  return ( (ipInt>>>24) + "." + (ipInt>>16 & 255) + "." + (ipInt>>8 & 255) + "." + (ipInt & 255) );
+}
+db.ip.aggregate([
+  { $unwind: "$_threat_category_summary" },
+  { $set: { category: "$_threat_category_summary.c", confidence: { $toString: "$_threat_category_summary.conf" } } },
+  { $project: { _id: 1, category: 1, confidence: 1 } }
+]).forEach(function(rec) { print(int2ip(rec._id) + "," + rec.category + "," + rec.confidence); })
+'''
+subprocess.run(fstr(script), shell=True)
+
+########################################################################################################################
+
+# full list - table format (ip,conf_scan,conf_bruteforce,...)
+if args.verbose:
+    print("Generating full IP list (table format)")
+
+out_file = f"{args.out_dir}/ip_category_table.csv"
+header = f"# ip,conf_{',conf_'.join(categories)}"
+query = '''
+function int2ip (ipInt) {
+  return ( (ipInt>>>24) + "." + (ipInt>>16 & 255) + "." + (ipInt>>8 & 255) + "." + (ipInt & 255) );
+}
+db.ip.aggregate([
+  { $project: { _id: 1, _threat_category_summary: 1 } },
+  { $set: { categories: { $arrayToObject: { $map: { input: "$_threat_category_summary", as: "cat", in: { k: "$$cat.c", v: { $toString: "$$cat.conf" } } } } } } },
+  { $replaceWith: { $mergeObjects: ["$$ROOT", "$categories"] } },
+  { $project: {
+    "_id": 1,
+'''
+for category in categories:
+    query += f'"{category}": {{ $ifNull: ["${category}", "0"] }},\n'
+query += '''
+}}]).forEach(function(rec) {
+  var categories = Object.keys(rec).filter(key => key !== "_id");
+  print(
+    int2ip(rec._id) + "," +
+    categories.map(key => rec[key]).join(",")
+  );
+})
+'''
+subprocess.run(fstr(script), shell=True)
+
+########################################################################################################################
+
+# blacklists
+if args.verbose:
+    print("Generating blacklists")
+
+for category in categories:
+    out_file = f"{args.out_dir}/bl_{category}.txt"
+    header = ""
+    query = f'''
+    function int2ip (ipInt) {{
+      return ( (ipInt>>>24) + "." + (ipInt>>16 & 255) + "." + (ipInt>>8 & 255) + "." + (ipInt & 255) );
+    }}
+    db.ip.find({{"_threat_category_summary": {{$elemMatch: {{"c": "{category}", "conf": {{$gt: {args.conf_thr}}}}}}}, "tags.whitelist": {{$exists: false}}}}, {{_id: 1}}).sort({{"_threat_category_summary.conf": -1}}).forEach( function(rec) {{ print(int2ip(rec._id)); }} );
+    '''
+    subprocess.run(fstr(script), shell=True)
+
+if args.verbose:
+    print("Done")
diff --git a/scripts/generate_ip_category_list.sh b/scripts/generate_ip_category_list.sh
deleted file mode 100644
index bf8fccf1..00000000
--- a/scripts/generate_ip_category_list.sh
+++ /dev/null
@@ -1,61 +0,0 @@
-#!/bin/sh
-# Generate list of all IPs in NERD's database with their categories
-# Takes one parameter - output directory
-
-out_dir="${1%/}"
-if [[ -z "$out_dir" ]]; then
-  out_dir=/data/web-data # no/emtpy parameter, use default
-elif [ ! -d "$out_dir" ]; then
-  # Create the output directory if it doesn't exist
-  mkdir -p "$out_dir"
-fi
-
-# Line format
-out_file="$out_dir/ip_category.csv"
-echo "# Generated at $(date -u '+%Y-%m-%d %H:%M UTC')" > "$out_file.tmp"
-echo "# IP,Category,Confidence" >> "$out_file.tmp"
-mongosh nerd --quiet --eval '
-function int2ip (ipInt) {
-  return ( (ipInt>>>24) + "." + (ipInt>>16 & 255) + "." + (ipInt>>8 & 255) + "." + (ipInt & 255) );
-}
-db.ip.aggregate([
-  { $unwind: "$_threat_category_summary" },
-  { $set: { category: "$_threat_category_summary.c", confidence: { $toString: "$_threat_category_summary.conf" } } },
-  { $project: { _id: 1, category: 1, confidence: 1 } }
-]).forEach(function(rec) { print(int2ip(rec._id) + "," + rec.category + "," + rec.confidence); })' | grep -v "^$" | sort -n >> "$out_file.tmp"
-mv "$out_file"{.tmp,}
-
-# Table format
-# TODO load category ids dynamically from categorization config
-out_file="$out_dir/ip_category_table.csv"
-echo "# Generated at $(date -u '+%Y-%m-%d %H:%M UTC')" > "$out_file.tmp"
-echo "# IP,botnet_drone,bruteforce,cc,ddos,ddos-amplifier,exploit,malware_distribution,phishing_site,scan,spam" >> "$out_file.tmp"
-mongosh nerd --quiet --eval '
-function int2ip (ipInt) {
-  return ( (ipInt>>>24) + "." + (ipInt>>16 & 255) + "." + (ipInt>>8 & 255) + "." + (ipInt & 255) );
-}
-db.ip.aggregate([
-  { $project: { _id: 1, _threat_category_summary: 1 } },
-  { $set: { categories: { $arrayToObject: { $map: { input: "$_threat_category_summary", as: "cat", in: { k: "$$cat.c", v: { $toString: "$$cat.conf" } } } } } } },
-  { $replaceWith: { $mergeObjects: ["$$ROOT", "$categories"] } },
-  { $project: {
-    "_id": 1,
-    "botnet_drone": { $ifNull: ["$botnet_drone", "0"] },
-    "bruteforce": { $ifNull: ["$bruteforce", "0"] },
-    "cc": { $ifNull: ["$cc", "0"] },
-    "ddos": { $ifNull: ["$ddos", "0"] },
-    "ddos-amplifier": { $ifNull: ["$ddos-amplifier", "0"] },
-    "exploit": { $ifNull: ["$exploit", "0"] },
-    "malware_distribution": { $ifNull: ["$malware_distribution", "0"] },
-    "phishing_site": { $ifNull: ["$phishing_site", "0"] },
-    "scan": { $ifNull: ["$scan", "0"] },
-    "spam": { $ifNull: ["$spam", "0"] }
-  }}
-]).forEach(function(rec) {
-  var categories = Object.keys(rec).filter(key => key !== "_id");
-  print(
-    int2ip(rec._id) + "," +
-    categories.map(key => rec[key]).join(",")
-  );
-})' | grep -v "^$" | sort -n >> "$out_file.tmp"
-mv "$out_file"{.tmp,}

From 682cae87f15b8fb1ad34a75beb0445092069bdbe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mat=C4=9Bj=20Barnat?= <barnat@cesnet.cz>
Date: Thu, 30 May 2024 07:57:48 +0200
Subject: [PATCH 17/21] threat_categorization.py: bugfix in blacklists init
 function

---
 common/threat_categorization.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/common/threat_categorization.py b/common/threat_categorization.py
index 2c5972e6..115848fd 100644
--- a/common/threat_categorization.py
+++ b/common/threat_categorization.py
@@ -103,7 +103,6 @@ def init_blacklists(self, blacklist_id, ip_info, download_time):
         self.date = download_time.strftime("%Y-%m-%d")
         self.description = blacklist_id
         self.ip_info = str(ip_info)
-        self.description = ""
         self.protocols = []
         self.target_ports = []
 

From 9aebb1d629c8e00f94726b7d8d493826a1678bc8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?V=C3=A1clav=20Barto=C5=A1?= <bartos@cesnet.cz>
Date: Thu, 5 Sep 2024 11:12:06 +0200
Subject: [PATCH 18/21] config: Improved descriptions of some blacklists

and fixed the name of URLHaus (was URLhouse)
---
 etc/primary_blacklists.yml    | 38 +++++++++++++----------------------
 etc/threat_categorization.yml |  2 +-
 2 files changed, 15 insertions(+), 25 deletions(-)

diff --git a/etc/primary_blacklists.yml b/etc/primary_blacklists.yml
index 1978970f..cde419dd 100644
--- a/etc/primary_blacklists.yml
+++ b/etc/primary_blacklists.yml
@@ -182,7 +182,7 @@ iplists:
   firehol_link: http://iplists.firehol.org/?ipset=blocklist_net_ua
   provider_link: https://blocklist.net.ua/about/
   url: https://blocklist.net.ua/blocklist.csv
-  regex: "^(\\A);.*;.*;(.*)"
+  regex: "^(\\A)"
   # The time of updating the list was viewed here: https://blocklist.net.ua/providers/  
   time:
     hour: "*/4"
@@ -272,7 +272,7 @@ iplists:
   descr: ThreatFox is a free platform from abuse.ch with the goal of<br>sharing indicators of compromise (IOCs) associated with malware with the<br>infosec community, AV vendors and threat intelligence providers.
   provider_link: https://threatfox.abuse.ch/
   url: https://threatfox.abuse.ch/export/csv/ip-port/recent/
-  regex: ".*?(\\A),(.*)"
+  regex: ".*?(\\A)"
   # The time of updating the list was viewed here: https://threatfox.abuse.ch/export/
   time:
     hour: "*/4"
@@ -280,7 +280,7 @@ iplists:
   
 - id: ssl_blacklist_ips
   name: SSL Blacklist
-  descr: The SSL Blacklist is a project of abuse.ch with the goal of<br>detecting malicious SSL connections and botnet activities.
+  descr: The SSL Blacklist is a project of abuse.ch with the goal of<br>detecting SSL certificates used by botnet C&C servers. NERD uses its list of C&C servers using such certificates.
   firehol_link: http://iplists.firehol.org/?ipset=sslbl
   provider_link: https://sslbl.abuse.ch/
   url: https://sslbl.abuse.ch/blacklist/sslipblacklist.txt
@@ -290,8 +290,8 @@ iplists:
     hour: 19
     minute: 30
   
-- id: urlhouse_ips
-  name: URLHouse
+- id: urlhaus_ips
+  name: URLHaus
   descr: URLhaus is a project from abuse.ch with the goal of sharing<br>malicious URLs that are being used for malware distribution.<br>This list contains IPs that are part of the malicious ULRs.
   provider_link: https://urlhaus.abuse.ch/
   url: https://urlhaus.abuse.ch/downloads/text_online/
@@ -380,7 +380,7 @@ iplists:
   
 - id: dataplane_org_sshclient
   name: DataPlane SSH conn
-  descr: DataPlane.org is a community-powered Internet data, feeds,<br>and measurement resource for operators, by operators. IP addresses that<br>has been seen initiating an SSH connection to a remote host.
+  descr: DataPlane.org is a community-powered Internet data, feeds,<br>and measurement resource for operators, by operators. IP addresses that<br>has been seen initiating an unsolicited SSH connection to a remote host.
   firehol_link: http://iplists.firehol.org/?ipset=dataplane_sshclient
   provider_link: https://dataplane.org/
   url: https://dataplane.org/sshclient.txt
@@ -392,7 +392,7 @@ iplists:
   
 - id: dataplane_org_sshpwauth
   name: DataPlane SSH login
-  descr: DataPlane.org is a community-powered Internet data, feeds,<br>and measurement resource for operators, by operators. IPs performing<br>login to a host using SSH password authentication.
+  descr: DataPlane.org is a community-powered Internet data, feeds,<br>and measurement resource for operators, by operators. IPs trying<br>an unsolicited login to a host using SSH password authentication.
   firehol_link: http://iplists.firehol.org/?ipset=dataplane_sshpwauth
   provider_link: https://dataplane.org/
   url: https://dataplane.org/sshpwauth.txt
@@ -404,7 +404,7 @@ iplists:
   
 - id: dataplane_org_vnc_rfb
   name: DataPlane VNC RFB
-  descr: DataPlane.org is a community-powered Internet data, feeds,<br>and measurement resource for operators, by operators. IPs initiating<br>a VNC remote frame buffer (RFB) session to a remote host.
+  descr: DataPlane.org is a community-powered Internet data, feeds,<br>and measurement resource for operators, by operators. IPs initiating<br>an unsolicited VNC remote frame buffer (RFB) session to a remote host.
   firehol_link: http://iplists.firehol.org/?ipset=dataplane_vncrfb
   provider_link: https://dataplane.org/
   url: https://dataplane.org/vncrfb.txt
@@ -416,7 +416,7 @@ iplists:
   
 - id: dataplane_org_telnet_login
   name: DataPlane TELNET login
-  descr: DataPlane.org is a community-powered Internet data, feeds,<br>and measurement resource for operators, by operators. IPs performing<br>login via TELNET password authentication.
+  descr: DataPlane.org is a community-powered Internet data, feeds,<br>and measurement resource for operators, by operators. IPs trying<br>an unsolicited login via TELNET password authentication.
   provider_link: https://dataplane.org/
   url: https://dataplane.org/telnetlogin.txt
   regex: ".*?(\\A)"
@@ -438,7 +438,7 @@ iplists:
   
 - id: dataplane_org_smtp_data
   name: DataPlane SMTP data
-  descr: DataPlane.org is a community-powered Internet data, feeds,<br>and measurement resource for operators, by operators. IP addresses that <br>have been identified as SMTP clients sending DATA commands.
+  descr: DataPlane.org is a community-powered Internet data, feeds,<br>and measurement resource for operators, by operators. IP addresses that <br>have been identified as SMTP clients sending unsolicited DATA commands.
   provider_link: https://dataplane.org/
   url: https://dataplane.org/smtpdata.txt
   regex: ".*?(\\A)"
@@ -460,7 +460,7 @@ iplists:
   
 - id: dataplane_org_sipinvitation
   name: DataPlane SIP invitation
-  descr: DataPlane.org is a community-powered Internet data, feeds,<br>and measurement resource for operators, by operators. IP addresses that<br>have been seen initiating a SIP INVITE operation to a remote host.
+  descr: DataPlane.org is a community-powered Internet data, feeds,<br>and measurement resource for operators, by operators. IP addresses that<br>have been seen initiating an unsolicited SIP INVITE operation to a remote host.
   firehol_link: http://iplists.firehol.org/?ipset=dataplane_sipinvitation
   provider_link: https://dataplane.org/
   url: https://dataplane.org/sipinvitation.txt
@@ -472,7 +472,7 @@ iplists:
   
 - id: dataplane_org_sipquery
   name: DataPlane SIP query
-  descr: DataPlane.org is a community-powered Internet data, feeds,<br>and measurement resource for operators, by operators. IP addresses that<br>has been seen initiating a SIP OPTIONS query to a remote host.
+  descr: DataPlane.org is a community-powered Internet data, feeds,<br>and measurement resource for operators, by operators. IP addresses that<br>has been seen initiating an unsolicited SIP OPTIONS query to a remote host.
   firehol_link: http://iplists.firehol.org/?ipset=dataplane_sipquery
   provider_link: https://dataplane.org/
   url: https://dataplane.org/sipquery.txt
@@ -484,7 +484,7 @@ iplists:
   
 - id: dataplane_org_sipregistration
   name: DataPlane SIP registration
-  descr: DataPlane.org is a community-powered Internet data, feeds,<br>and measurement resource for operators, by operators. IP addresses that<br>have been seen initiating a SIP REGISTER operation to a remote host.
+  descr: DataPlane.org is a community-powered Internet data, feeds,<br>and measurement resource for operators, by operators. IP addresses that<br>have been seen initiating an unsolicited SIP REGISTER operation to a remote host.
   firehol_link: http://iplists.firehol.org/?ipset=dataplane_sipregistration
   provider_link: https://dataplane.org/
   url: https://dataplane.org/sipregistration.txt
@@ -509,7 +509,7 @@ iplists:
 # AbuseIPDB blacklist is available to authorized users only, fill in the "Key" below with your API key and uncomment
 #- id: abuseipdb
 #  name: AbuseIPDB
-#  descr: AbuseIPDB is a project managed by Marathon Studios Inc.<br>IPs performing malicious activity(DDoS, spam, phishing...)
+#  descr: AbuseIPDB is a project managed by Marathon Studios Inc.<br>Lists IPs performing a malicious activity (DDoS, spam, phishing...)
 #  provider_link: https://www.abuseipdb.com/
 #  url: https://api.abuseipdb.com/api/v2/blacklist
 #  regex: ""
@@ -544,13 +544,3 @@ iplists:
 #  time:
 #    hour: 1,9,17
 #    minute: 45
-#
-#- id: crowdsec
-#  name: Crowdsec
-#  descr: Crowdsec community blacklist of malicious IPs.
-#  provider_link: https://docs.crowdsec.net/
-#  url: file:///data/blacklists/crowdsec.csv
-#  regex: "^(\\A),(.*)"
-#  time:
-#    hour: 1,9,17
-#    minute: 45
diff --git a/etc/threat_categorization.yml b/etc/threat_categorization.yml
index c707a354..9b925cf4 100644
--- a/etc/threat_categorization.yml
+++ b/etc/threat_categorization.yml
@@ -186,7 +186,7 @@ threat_categorization:
       misp_receiver: |-
         any([bool(re.findall(r'(?i)(malware|trojan|ransomware)', tag)) for tag in event.tags]) and event.ip_role == "dst"
       blacklists: |-
-        event.description == 'urlhouse_ips'
+        event.description == 'urlhaus_ips'
 
   cc:
     role: dst

From 7d00394b4ad610067086a48fc9d7f6f368f843f2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mat=C4=9Bj=20Barnat?= <barnat@cesnet.cz>
Date: Wed, 8 Jan 2025 11:37:16 +0100
Subject: [PATCH 19/21] web: add API support for threat categorization

---
 NERDweb/nerd_main.py | 34 ++++++++++++++++++++++++++++++----
 1 file changed, 30 insertions(+), 4 deletions(-)

diff --git a/NERDweb/nerd_main.py b/NERDweb/nerd_main.py
index 57be603d..8cb5d693 100644
--- a/NERDweb/nerd_main.py
+++ b/NERDweb/nerd_main.py
@@ -1806,7 +1806,7 @@ def get_ip_info(ipaddr, full):
     else:
         ipinfo = mongo.db.ip.find_one({'_id': ipint},
                                       {'rep': 1, 'fmp': 1, 'hostname': 1, 'bgppref': 1, 'ipblock': 1, 'geo': 1, 'bl': 1,
-                                       'tags': 1})
+                                       'tags': 1, '_threat_category_summary': 1})
     if not ipinfo:
         log_err.log('404_api_ip_not_found')
         data['err_n'] = 404
@@ -1927,6 +1927,15 @@ def get_basic_info_dic(val):
 
         tags_l.append(d)
 
+    threat_category_l = []
+    for rec in val.get('_threat_category_summary', []):
+        threat_category_l.append({
+            'role': rec['r'],
+            'category': rec['c'],
+            'subcategory': rec['s'],
+            'confidence': rec['conf'],
+        })
+
     data = {
         'ip': val['_id'],
         'rep': val.get('rep', 0.0),
@@ -1937,14 +1946,15 @@ def get_basic_info_dic(val):
         'asn': val.get('asn', []),
         'geo': geo_d,
         'bl': bl_l,
-        'tags': tags_l
+        'tags': tags_l,
+        'threat_category': threat_category_l
     }
 
     return data
 
 
 def get_basic_info_dic_short(val):
-    # only 'rep' and 'tags' fields
+    # only 'rep', 'tags' and 'threat_category' fields
     tags_l = []
     for l in val.get('tags', []):
         d = {
@@ -1953,10 +1963,20 @@ def get_basic_info_dic_short(val):
         }
         tags_l.append(d)
 
+    threat_category_l = []
+    for rec in val.get('_threat_category_summary', []):
+        threat_category_l.append({
+            'role': rec['r'],
+            'cat': rec['c'],
+            'subcategory': rec['s'],
+            'conf': rec['conf']
+        })
+
     data = {
         'ip': val['_id'],
         'rep': val.get('rep', 0.0),
-        'tags': tags_l
+        'tags': tags_l,
+        'threat_category': threat_category_l
     }
     return data
 
@@ -2098,6 +2118,12 @@ def get_full_info(ipaddr=None):
             'total7': val.get('events_meta', {}).get('total7', 0.0),
             'total30': val.get('events_meta', {}).get('total30', 0.0),
         },
+        'threat_category': [{
+            'role': rec['r'],
+            'category': rec['c'],
+            'confidence': rec['conf'],
+            'sources': rec['src']
+        } for rec in val.get('_threat_category_summary', [])]
     }
 
     return Response(json.dumps(data), 200, mimetype='application/json')

From b9a5ab5b80ea5cee61cb56bcfe032c1c95347cdf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?V=C3=A1clav=20Barto=C5=A1?= <bartos@cesnet.cz>
Date: Tue, 26 Aug 2025 16:30:49 +0200
Subject: [PATCH 20/21] threat_categorization: rozpracovane zmeny

---
 NERDd/modules/threat_category_summary.py |  2 +-
 NERDweb/nerd_main.py                     | 66 +++++++++++++++---------
 NERDweb/static/main.js                   |  2 +-
 NERDweb/templates/ip.html                |  6 +--
 common/threat_categorization.py          |  3 +-
 5 files changed, 48 insertions(+), 31 deletions(-)

diff --git a/NERDd/modules/threat_category_summary.py b/NERDd/modules/threat_category_summary.py
index aef673de..80877633 100644
--- a/NERDd/modules/threat_category_summary.py
+++ b/NERDd/modules/threat_category_summary.py
@@ -1,7 +1,7 @@
 """
 NERD module summarizing threat category records.
 
-Should be triggered at least once a day for every address.
+Triggered when _threat_category attribute changes or at least once a day for every address.
 """
 
 from core.basemodule import NERDModule
diff --git a/NERDweb/nerd_main.py b/NERDweb/nerd_main.py
index 8cb5d693..3585399f 100644
--- a/NERDweb/nerd_main.py
+++ b/NERDweb/nerd_main.py
@@ -74,6 +74,19 @@
 categorization_cfg_file = os.path.join(cfg_dir, 'threat_categorization.yml')
 threat_categorization_config = common.config.read_config(categorization_cfg_file)["threat_categorization"]
 
+# Mapping of source IDs usind in DB (e.g. as ttl token) to user readable name
+# This list is used to generate:
+# - a drop-down menu in search form
+# - lists of sources in threat_category tooltips
+# (what is defined here appears on the web, in the same order)
+SOURCE_NAMES = {
+    "warden": "Warden",
+    "bl": "Blacklists",
+    "dshield": "DShield",
+    "otx": "OTX",
+    "misp": "MISP",
+}
+
 # Dict: blacklist_id -> parameters
 #  parameters should contain:
 #    all: id, name, descr, feed_type
@@ -884,18 +897,10 @@ def __init__(self, *args, **kwargs):
         self.country.choices = [(i, '{} - {}'.format(i, ctrydata.names[i])) for i in ctrydata.names.keys()]
 
         # Load numbers of IPs per data source (also precomputed in DB)
-        #  (Numbers of IPs per source are computed from TTL tokens; list of sources to show is hard-coded here, since
+        #  (Numbers of IPs per source are computed from TTL tokens; list of sources to show is hard-coded, since
         #   we don't want to show all used TTL token IDs as data sources.)
-        # mapping of DB name (ttl token) -> user readable name (what is defined here appears on the web, in the same order)
-        source_names = {
-            "warden": "Warden",
-            "bl": "Blacklists",
-            "dshield": "DShield",
-            "otx": "OTX",
-            "misp": "MISP",
-        }
         cnt_by_source = {item["_id"]: item["n"] for item in mongo.db.n_ip_by_ttl.find()}
-        self.source.choices = [(src_id, '{} ({})'.format(src_name, int(cnt_by_source.get(src_id, 0)))) for src_id,src_name in source_names.items()]
+        self.source.choices = [(src_id, '{} ({})'.format(src_name, int(cnt_by_source.get(src_id, 0)))) for src_id,src_name in SOURCE_NAMES.items()]
 
         # Load categorization config to get list of all categories
         self.tc_role.choices = [("src", "Source"), ("dst", "Destination")]
@@ -1171,31 +1176,42 @@ def ips():
 
 
 def create_threat_category_table(category_records, min_confidence, max_subcategory_values):
-    source_names = {
-        'warden': 'Warden',
-        'misp': 'MISP',
-        'otx': 'OTX',
-        'dshield': 'DShield',
-        'bl': 'Blacklists'
-    }
+    """Prepare data about threat category tags - for ips.html as well as the table in ip.html"""
     table_rows = []
     for rec in category_records:
+        # rec is dict with the following fields:
+        #   'r':str - role (src/dst)
+        #   'c':str - category
+        #   'src':dict[str,int] - number of events per source
+        #   's':dict[str,Any] - subcategories/details (proto, port, malware_family)
+        #   'conf':float - confidence
         if rec['conf'] < min_confidence:
             continue
+
+        # Generate tooltip content (as html string)
+        # TODO: This should be done in Jinja template, not here
         category_description = threat_categorization_config.get(rec['c'], {}).get('description', f"ERROR: missing configuration for category '{rec['c']}'")
-        sources_str = ''.join([f"<li>{source_names[source]} ({n_reports})</li>" for source, n_reports in sorted(rec['src'].items())])
+        sources_str = ''.join([f"<li>{SOURCE_NAMES[source]} ({n_reports})</li>" for source, n_reports in sorted(rec['src'].items())])
         tooltip_content = f"<b>{category_description}</b><br/><br/>Confidence: {rec['conf']}<br/>Sources:<br/><ul>{sources_str}</ul>"
+
+        # Generate table rows
+        # row = [role, category, subcategory, confidence, tooltip content]
         subcategories = list(rec['s'].items())
+        # No subcategories -> create single line
         if not subcategories:
-            table_rows.append([rec['r'], rec['c'], "", tooltip_content])
+            table_rows.append([rec['r'], rec['c'], "", rec['conf'], tooltip_content])
+        # Subcategories
         else:
-            key, values = subcategories[0]
-            subcategory_content = f"{key}: {', '.join(values)}" if len(values) <= max_subcategory_values else f"{key}: <i>many</i>"
-            table_rows.append([rec['r'], rec['c'], subcategory_content, tooltip_content])
-            for item in subcategories[1:]:
-                key, values = item
+            # key, values = subcategories[0]
+            # subcategory_content = f"{key}: {', '.join(values)}" if len(values) <= max_subcategory_values else f"{key}: <i>many</i>"
+            # table_rows.append([rec['r'], rec['c'], subcategory_content, tooltip_content])
+            # for item in subcategories[1:]:
+            #     key, values = item
+            #     subcategory_content = f"{key}: {', '.join(values)}" if len(values) <= max_subcategory_values else f"{key}: <i>many</i>"
+            #     table_rows.append(["", "", subcategory_content, tooltip_content])
+            for key, values in subcategories[1:]:
                 subcategory_content = f"{key}: {', '.join(values)}" if len(values) <= max_subcategory_values else f"{key}: <i>many</i>"
-                table_rows.append(["", "", subcategory_content, tooltip_content])
+                table_rows.append([rec['r'], rec['c'], subcategory_content, rec['conf'], tooltip_content])
     return table_rows
 
 
diff --git a/NERDweb/static/main.js b/NERDweb/static/main.js
index e21b9795..2c049f6e 100644
--- a/NERDweb/static/main.js
+++ b/NERDweb/static/main.js
@@ -2,7 +2,7 @@
 
 function create_event_table(data) { /* data are "dataset" field of a DOM node with "data-" attributes set */
   if (data.table == "") {
-     return "No events";
+     return "No Warden events";
   }
   var cats = data.cats.split(",");
   var dates = data.dates.split(",");
diff --git a/NERDweb/templates/ip.html b/NERDweb/templates/ip.html
index de5debf7..b6066279 100644
--- a/NERDweb/templates/ip.html
+++ b/NERDweb/templates/ip.html
@@ -292,10 +292,10 @@ <h1>IP address</h1>
 <p class="caption">Threat category</p>
 <div class="threat_category_detail">
     <table>
-        <tr><th>Role</th><th>Category</th><th>Subcategory</th></tr>
+        <tr><th>Role</th><th>Category</th><th>Details</th><th>Confidence</th></tr>
         {% for row in threat_category_table %}
-            <tr class="threat_category_tooltip" title="{{ row[3] }}">
-            {% for col in row[:3] %}
+            <tr class="threat_category_tooltip" title="{{ row[4] }}">
+            {% for col in row[:4] %}
                 <td>{{ col|safe }}</td>
             {% endfor %}
             </tr>
diff --git a/common/threat_categorization.py b/common/threat_categorization.py
index 115848fd..c0fe2c73 100644
--- a/common/threat_categorization.py
+++ b/common/threat_categorization.py
@@ -52,7 +52,8 @@ def init_warden_receiver(self, event, source):
             target_ports += target.get('Port', [])
             protocols += target.get('Proto', [])
         self.target_ports = [str(port) for port in set(target_ports)]
-        self.protocols = list(set(protocols))
+        # Protocol list often contains 'tcp'/'udp', but we only want L7 protocol here -> remove the two
+        self.protocols = list(set(protocols) - {'tcp', 'udp'})
 
     def init_otx_receiver(self, pulse):
         """

From 1bea8bf784a92ec9803e495abb512fb6730e9254 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?V=C3=A1clav=20Barto=C5=A1?= <bartos@cesnet.cz>
Date: Tue, 10 Feb 2026 19:15:42 +0100
Subject: [PATCH 21/21] threat_categorization

- Visualization on web finished
- Some other changes
---
 NERDd/blacklists.py                      |  2 +-
 NERDd/core/update_manager.py             | 49 ++++++++++--------
 NERDd/g.py                               |  5 ++
 NERDd/misp_receiver.py                   |  2 +-
 NERDd/modules/dnsbl.py                   | 19 +++++--
 NERDd/modules/threat_category_summary.py |  4 +-
 NERDd/otx_receiver.py                    |  2 +-
 NERDd/warden_receiver.py                 |  2 +-
 NERDweb/nerd_main.py                     | 64 ++++++++++++++++++------
 NERDweb/static/style.css                 | 45 ++++++++++++++---
 NERDweb/templates/ip.html                | 43 +++++++++-------
 NERDweb/templates/ips.html               | 39 ++++++++-------
 common/config.py                         |  9 +++-
 common/threat_categorization.py          |  7 +--
 etc/threat_categorization.yml            | 34 +++++++++----
 scripts/generate_ip_category_files.py    |  2 +-
 16 files changed, 224 insertions(+), 104 deletions(-)

diff --git a/NERDd/blacklists.py b/NERDd/blacklists.py
index f99b05c9..3a37fc03 100755
--- a/NERDd/blacklists.py
+++ b/NERDd/blacklists.py
@@ -238,7 +238,7 @@ def stop(signal, frame):
     log.info("Loading config file {}".format(categorization_cfg_file))
     config.update(common.config.read_config(categorization_cfg_file))
     categorization_config = {
-        "categories": config.get('threat_categorization'),
+        "categories": config.get('threat_categories'),
         "malware_families": common.config.read_config(config.get('malpedia_family_list_path'))
     }
 
diff --git a/NERDd/core/update_manager.py b/NERDd/core/update_manager.py
index 025f9167..5434302a 100644
--- a/NERDd/core/update_manager.py
+++ b/NERDd/core/update_manager.py
@@ -303,9 +303,11 @@ def __init__(self, config, db, process_index, num_processes):
         self.elog_op = g.ecl.get_group("rec_ops", True) # True = return DummyEventGroup if there's no configuration for given group name
         self.elog_by_src = g.ecl.get_group("tasks_by_src", True)
 
-        # This is here for performance debugging - measuring the time spent in each handler function
-#        self.t_handlers = Counter()
-#        self.logging_scheduler.register(self.log_t_handlers, second="*/60")
+        # Counter for performance debugging - measuring the time spent in each handler function
+        # A summary is printed every minute
+        if g.DEBUG_PERFORMANCE:
+            self.t_handlers = Counter()
+            g.scheduler.register(self.log_t_handlers, minute="*")
 
 
     def register_handler(self, func, etype, triggers, changes):
@@ -565,7 +567,8 @@ def _process_update_req(self, etype, eid, update_requests):
         #     updates (2-tuples (key, new_value) or (event, param) which triggered the function.
         #   may_change - set of attributes that may be changed by planned function calls
 
-#        t1 = time.time()
+        if g.DEBUG_PERFORMANCE:
+            t1 = time.time()
 
         # Check whether a new record should not be created in case every operation is 'weak' (starts with '*')
         weak_op = True
@@ -603,9 +606,10 @@ def _process_update_req(self, etype, eid, update_requests):
             return False
         
         requests_to_process = update_requests
-        
-#        t2 = time.time()
-#        t_handlers = {}
+
+        if g.DEBUG_PERFORMANCE:
+            t2 = time.time()
+            t_handlers = {}
         
         # *** Now we have the record, process the requested updates ***
         
@@ -699,7 +703,8 @@ def _process_update_req(self, etype, eid, update_requests):
             # Call the event handler function.
             # Set of requested updates of the record should be returned
             #self.log.debug("Calling: {}(({}, {}), rec, {})".format(get_func_name(func), etype, eid, updates))
-#            t_handler1 = time.time()
+            if g.DEBUG_PERFORMANCE:
+                t_handler1 = time.time()
             try:
                 reqs = func((etype, eid), rec, updates)
             except Exception as e:
@@ -707,8 +712,10 @@ def _process_update_req(self, etype, eid, update_requests):
                     .format(get_func_name(func), etype, eid, updates) )
                 g.ecl['errors'].log('error_in_module')
                 reqs = []
-#            t_handler2 = time.time()
-#            t_handlers[get_func_name(func)] = t_handler2 - t_handler1
+            if g.DEBUG_PERFORMANCE:
+                t_handler2 = time.time()
+                # Record time spent in this handler function (merged into the global counter at the end of this task)
+                t_handlers[get_func_name(func)] = t_handler2 - t_handler1
 
             # Set requested updates to requests_to_process
             if reqs:
@@ -728,8 +735,9 @@ def _process_update_req(self, etype, eid, update_requests):
         
         # Set ts_last_update
         rec['ts_last_update'] = datetime.utcnow()
-        
-#        t3 = time.time()
+
+        if g.DEBUG_PERFORMANCE:
+            t3 = time.time()
 
         # Remove or update processed database record
         if deletion:
@@ -740,13 +748,14 @@ def _process_update_req(self, etype, eid, update_requests):
             self.db.put(etype, eid, rec)
             self.elog_op.log(etype+'_updated') # normal record update
 
-        
-#        t4 = time.time()
-#        #if t4 - t1 > 1.0:
-#        #    self.log.info("Entity ({}:{}): load: {:.3f}s, process: {:.3f}s, store: {:.3f}s".format(etype, eid, t2-t1, t3-t2, t4-t3))
-#        #    self.log.info("  handlers:" + ", ".join("{}: {:.3f}s".format(fname, t) for fname, t in t_handlers))
-#
-#        self.t_handlers.update(t_handlers)
+        if g.DEBUG_PERFORMANCE:
+            t4 = time.time()
+            # If this task took longer than 1.0 sec, print details
+            if t4 - t1 > 1.0:
+                self.log.warning("Task processing took too long: Entity ({}:{}): load: {:.3f}s, process: {:.3f}s, store: {:.3f}s\n".format(etype, eid, t2-t1, t3-t2, t4-t3) +
+                    "  handlers: " + ", ".join("{}: {:.3f}s".format(fname, t) for fname, t in t_handlers.items()))
+            # Add run-times of handler functions to the global counter
+            self.t_handlers.update(t_handlers)
         
         return new_rec_created
 
@@ -773,7 +782,7 @@ def dump_handler_chain(self, etype):
         return s
 
     def log_t_handlers(self):
-        print("Handler function running times:")
+        print("Handler function running times (total times within last 60 seconds, top-10):")
         for name, t in self.t_handlers.most_common(10):
             print("{:50s} {:7.3f}".format(name, t))
         self.t_handlers = Counter()
diff --git a/NERDd/g.py b/NERDd/g.py
index 2d139070..e2e0ddc4 100644
--- a/NERDd/g.py
+++ b/NERDd/g.py
@@ -4,3 +4,8 @@
 
 # Global flag telling if the daemon is running
 running = False
+
+# Enable performance debugging (must be manually set to True here; TODO: move to config)
+# (used in core/update_manager.py and modules/dnsbl.py)
+# May result in many log messages, only enable if you have issues with performance (task processing takes too long)
+DEBUG_PERFORMANCE = False
diff --git a/NERDd/misp_receiver.py b/NERDd/misp_receiver.py
index 0269ed12..0052cac6 100644
--- a/NERDd/misp_receiver.py
+++ b/NERDd/misp_receiver.py
@@ -67,7 +67,7 @@
 logger.info("Loading config file {}".format(categorization_cfg_file))
 config.update(read_config(categorization_cfg_file))
 categorization_config = {
-    "categories": config.get('threat_categorization'),
+    "categories": config.get('threat_categories'),
     "malware_families": read_config(config.get('malpedia_family_list_path'))
 }
 
diff --git a/NERDd/modules/dnsbl.py b/NERDd/modules/dnsbl.py
index cc0fefcc..c79fd884 100644
--- a/NERDd/modules/dnsbl.py
+++ b/NERDd/modules/dnsbl.py
@@ -16,8 +16,11 @@
 import socket
 import logging
 import threading
+import time
 from datetime import datetime, date, timezone
 
+# Print a warning if a DNS query takes longer than 2 seconds (only if performance debugging is enabled)
+LONG_QUERY_THRESHOLD = 2.0 if g.DEBUG_PERFORMANCE else None
 
 # From pycares example "cares-select.py"
 # https://github.com/saghul/pycares/blob/master/examples/cares-select.py
@@ -38,14 +41,14 @@ def _wait_channel(channel):
             channel.process_fd(pycares.ARES_SOCKET_BAD, fd)
 
 
-def _make_result_handler(bl, results):
+def _make_result_handler(bl, results, start_time, log):
     """
     Create callback function using given blacklist spec and writing to given
     results array.
     (note: if you don't understand this way of making a function, google 
     "python closure")
     
-    bl - blacklist configuration (name, zone, dict{result -> blacklist_id})
+    bl - blacklist configuration (zone, dict{result -> blacklist_id})
     results - list to put blacklist_ids
     """
     def handler(res, err):
@@ -55,6 +58,10 @@ def handler(res, err):
         res - list of results (tuples hostname,ttl)
         err - error code
         """
+        if LONG_QUERY_THRESHOLD:
+            query_time = time.time() - start_time
+            if query_time > LONG_QUERY_THRESHOLD:
+                log.warning(f"Long DNSBL query: {bl[0]}, {query_time} sec, results: {res}")
         if res is not None:
             for r in res:
                 blacklist = bl[1].get(r.host, {})
@@ -207,14 +214,16 @@ def query_blacklists(self, ekey, rec, updates):
         revip = reverse_ip(ip)
 
         self.log.debug("Querying blacklists for {}".format(ekey))
-        
-        channel = pycares.Channel(servers=self.nameservers)
+
+        # Set ares channel to use given nameservers and set timeout to 2 seconds and max 2 tries
+        channel = pycares.Channel(servers=self.nameservers, timeout=2, tries=2)
         results = []        
         
         # Create queries to all blacklists
         for bl in self.blacklists.items():
+            start_time = time.time()
             channel.query(revip + '.' + bl[0], pycares.QUERY_TYPE_A,
-                _make_result_handler(bl, results)
+                _make_result_handler(bl, results, start_time, self.log)
             )
         # Send all queries and wait for results
         #(they are handled by self._process_result callback)
diff --git a/NERDd/modules/threat_category_summary.py b/NERDd/modules/threat_category_summary.py
index 80877633..206895c3 100644
--- a/NERDd/modules/threat_category_summary.py
+++ b/NERDd/modules/threat_category_summary.py
@@ -28,7 +28,7 @@ class ThreatCategorySummary(NERDModule):
 
     def __init__(self):
         categorization_config_file = os.path.join(g.config_base_path, g.config.get("threat_categorization_config"))
-        self.config = common.config.read_config(categorization_config_file).get("threat_categorization", {})
+        self.config = common.config.read_config(categorization_config_file)
 
         g.um.register_handler(
             self.create_summary,  # function (or bound method) to call
@@ -76,7 +76,7 @@ def create_summary(self, ekey, rec, updates):
         summary = []
 
         for cat, records in grouped_by_category.items():
-            role = self.config[cat]['role']
+            role = self.config.get(f'threat_categories.{cat}.role', '?')
             cat_summary = {
                 'r': role,
                 'c': cat,
diff --git a/NERDd/otx_receiver.py b/NERDd/otx_receiver.py
index 080b4b41..20243d43 100644
--- a/NERDd/otx_receiver.py
+++ b/NERDd/otx_receiver.py
@@ -87,7 +87,7 @@ def parse_datetime(time_str):
 logger.info("Loading config file {}".format(categorization_cfg_file))
 config.update(read_config(categorization_cfg_file))
 categorization_config = {
-    "categories": config.get('threat_categorization'),
+    "categories": config.get('threat_categories'),
     "malware_families": read_config(config.get('malpedia_family_list_path'))
 }
 
diff --git a/NERDd/warden_receiver.py b/NERDd/warden_receiver.py
index 6818c6a5..9f3bc0cd 100644
--- a/NERDd/warden_receiver.py
+++ b/NERDd/warden_receiver.py
@@ -620,7 +620,7 @@ def receive_events(filer_path, eventdb, task_queue_writer, inactive_ip_lifetime,
     rabbit_config = config.get("rabbitmq")
     filer_path = config.get('warden_filer_path')
     categorization_config = {
-        "categories": config.get('threat_categorization'),
+        "categories": config.get('threat_categories'),
         "malware_families": common.config.read_config(config.get('malpedia_family_list_path'))
     }
 
diff --git a/NERDweb/nerd_main.py b/NERDweb/nerd_main.py
index 0a0336a5..bda1c4bd 100644
--- a/NERDweb/nerd_main.py
+++ b/NERDweb/nerd_main.py
@@ -72,7 +72,7 @@
 
 # Read threat categorization config
 categorization_cfg_file = os.path.join(cfg_dir, 'threat_categorization.yml')
-threat_categorization_config = common.config.read_config(categorization_cfg_file)["threat_categorization"]
+threat_cat_config = common.config.read_config(categorization_cfg_file)
 
 # Mapping of source IDs usind in DB (e.g. as ttl token) to user readable name
 # This list is used to generate:
@@ -906,7 +906,7 @@ def __init__(self, *args, **kwargs):
 
         # Load categorization config to get list of all categories
         self.tc_role.choices = [("src", "Source"), ("dst", "Destination")]
-        self.tc_category.choices = sorted([(cat_id, cat_data['label']) for cat_id, cat_data in threat_categorization_config.items()])
+        self.tc_category.choices = sorted([(cat_id, cat_data['label']) for cat_id, cat_data in threat_cat_config["threat_categories"].items()])
         self.tc_subcategory_key.choices = [("", "--"), ("port", "Port"), ("protocol", "Protocol"), ("malware_family", "Malware family")]
 
         # Number of occurrences for blacklists (list of blacklists is taken from configuration)
@@ -1167,7 +1167,7 @@ def ips():
 
             # Add info about threat category
             min_confidence = float(form.tc_confidence.data) if form.tc_confidence.data else 0
-            ip['_threat_category_table'] = create_threat_category_table(ip.get('_threat_category_summary', []), min_confidence, 9)
+            ip['_threat_category_data_for_tags'] = create_threat_category_data_for_tags(ip.get('_threat_category_summary', []), min_confidence, 9)
     else:
         results = None
         form.ip_list.data = ""
@@ -1177,6 +1177,7 @@ def ips():
     return render_template('ips.html', json=json, ctrydata=ctrydata, blacklist_info=blacklist_info, **locals())
 
 
+# TODO move near the "ip" endpoint
 def create_threat_category_table(category_records, min_confidence, max_subcategory_values):
     """Prepare data about threat category tags - for ips.html as well as the table in ip.html"""
     table_rows = []
@@ -1189,12 +1190,7 @@ def create_threat_category_table(category_records, min_confidence, max_subcatego
         #   'conf':float - confidence
         if rec['conf'] < min_confidence:
             continue
-
-        # Generate tooltip content (as html string)
-        # TODO: This should be done in Jinja template, not here
-        category_description = threat_categorization_config.get(rec['c'], {}).get('description', f"ERROR: missing configuration for category '{rec['c']}'")
-        sources_str = ''.join([f"<li>{SOURCE_NAMES[source]} ({n_reports})</li>" for source, n_reports in sorted(rec['src'].items())])
-        tooltip_content = f"<b>{category_description}</b><br/><br/>Confidence: {rec['conf']}<br/>Sources:<br/><ul>{sources_str}</ul>"
+        tooltip_content = get_threat_category_tooltip(rec)
 
         # Generate table rows
         # row = [role, category, subcategory, confidence, tooltip content]
@@ -1216,6 +1212,42 @@ def create_threat_category_table(category_records, min_confidence, max_subcatego
                 table_rows.append([rec['r'], rec['c'], subcategory_content, rec['conf'], tooltip_content])
     return table_rows
 
+def create_threat_category_data_for_tags(category_records, min_confidence, max_subcategory_values):
+    """Prepare data for threat category tags in search results and for table in IP detail page"""
+    TAG_DEFAULT_COLOR = '#777777' # if color is not defined in configuration
+    rows = []
+    for rec in category_records:
+        if rec['conf'] < min_confidence:
+            continue
+        subcategories = []
+        for key,values in rec['s'].items(): # subcategories
+            if len(values) <= max_subcategory_values:
+                # sort values (numerically if port numbers, lexicographically otherwise)
+                if key == 'port':
+                    values.sort(key=int)
+                else:
+                    values.sort()
+                subcategories.append(f"{key}: {', '.join(values)}")
+            else:
+                subcategories.append(f"{key}: <i>many</i>")
+        rows.append({
+            'role': rec['r'],
+            'role_color': threat_cat_config.get("role_colors", {}).get(rec['r'], TAG_DEFAULT_COLOR),
+            'cat': rec['c'] if rec['c'] != "unknown" else "&mdash;", # replace "unknown" with "—"
+            'cat_color': threat_cat_config.get(f"threat_categories.{rec['c']}.color", TAG_DEFAULT_COLOR),
+            'subcats': subcategories,
+            'conf': rec['conf'],
+            'tooltip': get_threat_category_tooltip(rec)
+        })
+    return rows
+
+def get_threat_category_tooltip(rec):
+    # TODO this should be generated in a Jinja2 template or JavaScript, not here
+    category_description = threat_cat_config.get(f"threat_categories.{rec['c']}.description", f"ERROR: missing configuration for category '{rec['c']}'")
+    sources_str = ''.join([f"<li>{SOURCE_NAMES[source]} ({n_reports})</li>" for source, n_reports in sorted(rec['src'].items())])
+    return f"Category \"{rec['c']}\":<br><b>{category_description}</b><br><br>Sources reporting the IP under this category (number of alerts/reports in last 14 days):<ul>{sources_str}</ul><br>Confidence: {rec['conf']}"
+
+
 
 @app.route('/_ips_count', methods=["POST"])
 def ips_count():
@@ -1400,7 +1432,8 @@ def ip(ipaddr=None):
                 ipinfo['asns'] = asn_list
 
                 # Create threat category table
-                threat_category_table = create_threat_category_table(ipinfo.get('_threat_category_summary', []), 0, 9)
+                #threat_category_table = create_threat_category_table(ipinfo.get('_threat_category_summary', []), 0, 9)
+                threat_category_data = create_threat_category_data_for_tags(ipinfo.get('_threat_category_summary', []), 0, 9)
 
                 # Pseudonymize node names if user is not allowed to see the original names
                 if not g.ac('nodenames'):
@@ -1747,7 +1780,7 @@ def map_index():
 ]
 
 # Add category blacklist files (created by /scripts/generate_category_blocklist.sh)
-BL_FILES = [f"bl_{cat}.txt" for cat in threat_categorization_config if cat != "unknown"]
+BL_FILES = [f"bl_{cat}.txt" for cat in threat_cat_config["threat_categories"] if cat != "unknown"]
 FILES += BL_FILES
 
 @app.route('/data/')
@@ -1986,9 +2019,9 @@ def get_basic_info_dic_short(val):
     for rec in val.get('_threat_category_summary', []):
         threat_category_l.append({
             'role': rec['r'],
-            'cat': rec['c'],
+            'category': rec['c'],
             'subcategory': rec['s'],
-            'conf': rec['conf']
+            'confidence': rec['conf'],
         })
 
     data = {
@@ -2140,9 +2173,10 @@ def get_full_info(ipaddr=None):
         'threat_category': [{
             'role': rec['r'],
             'category': rec['c'],
+            'subcategory': rec['s'],
             'confidence': rec['conf'],
             'sources': rec['src']
-        } for rec in val.get('_threat_category_summary', [])]
+        } for rec in val.get('_threat_category_summary', [])],
     }
 
     return Response(json.dumps(data), 200, mimetype='application/json')
@@ -2484,4 +2518,4 @@ def get_shodan_response(ipaddr=None):
     config['login']['methods'] = {}
     # Run built-in server
     app.run(host="127.0.0.1", debug=True)
-    
\ No newline at end of file
+    
diff --git a/NERDweb/static/style.css b/NERDweb/static/style.css
index 8f2c98be..bc73f20e 100644
--- a/NERDweb/static/style.css
+++ b/NERDweb/static/style.css
@@ -35,6 +35,9 @@ hr {
   padding: 0.25em 0.4em;
   max-width: 80em;
 }
+.ui-tooltip ul {
+  margin: 0;
+}
 
 p.error {
   color: #900;
@@ -489,6 +492,30 @@ td.country a {
   color: inherit;
 }
 
+.threat_tag {
+  display: inline-table;
+  font-size: 1em;
+  color: #000;
+  /* color will be rewritten by inline styles according to the threat category */
+  background: #fff;
+  border-radius: 5px 0 0 5px;
+  border: 0;
+  border-right: 3px solid;
+  border-collapse: collapse;
+  box-shadow: 1px 1px 1px 1px rgba(0,0,0,0.1);
+  margin: 0 0.2em;
+}
+.threat_tag td {
+  border: 0;
+  border-right: 1px solid #777;
+}
+.threat_tag td:nth-child(3) {
+  font-size: 0.8em;
+  border-right: 0;
+}
+
+
+/*
 .threat_category_preview table {
   text-align: left;
   border: hidden;
@@ -504,25 +531,31 @@ td.country a {
   padding: 0 0.5em;
   min-width: 9.5em;
 }
-
+*/
 .threat_category_detail table {
   text-align: left;
   border: solid #000;
   border-width: 1px 1px 1px 1px;
+  border-collapse: collapse;
 }
 .threat_category_detail table td,
 .threat_category_detail table th {
   border: solid #000;
-  border-width: 1px 1px 1px 1px;
+  border-width: 1px 0px 0px 0px;
   padding: 0.1em 0.4em;
-}
-.threat_category_detail table td {
-  text-align: left;
+  height: 2em;
+  text-align: center;
 }
 .threat_category_detail table th {
-  text-align: center;
+  border-width: 1px 1px 1px 1px;
+}
+.threat_category_detail table td:nth-child(n+3), /* 3rd and following cells */
+.threat_category_detail table th:nth-child(n+3)
+{
+  text-align: left;
 }
 
+
 td.events {
   text-align: right;
   padding-right: 0;
diff --git a/NERDweb/templates/ip.html b/NERDweb/templates/ip.html
index cd02bde2..833b2ae8 100644
--- a/NERDweb/templates/ip.html
+++ b/NERDweb/templates/ip.html
@@ -155,6 +155,27 @@ <h1>IP address</h1>
   {% set dbl = ipinfo.pop('dbl') %}
 {% endif %}
 
+{# Threat category summary #}
+<p class="caption">Threat categories</p>
+<div class="threat_category_detail">
+<table>
+    <tr><th title="Threat level">TL</th><th>Role</th><th>Category</th><th>Details</th></tr>
+    {% if threat_category_data %}
+    {% for tag in threat_category_data %} {# items in "tag": role, role_color, cat, cat_color, subcats, conf, tooltip #}
+    <tr class="threat_category_tooltip" title="{{ tag['tooltip'] }}">
+      <td>{{ (tag.conf*100)|round|int }}</td>
+      <td style="background-color: {{tag.role_color}}">{{ tag.role|safe }}</td>
+      <td style="background-color: {{tag.cat_color}}">{{ tag.cat|safe }}</td>
+      <td style="background-color: {{tag.cat_color}}">{% for subcat in tag.subcats %}{{ subcat|safe }}<br>{% endfor %}</td>
+    </tr>
+    {% endfor %}
+    {% else %}
+    <tr><td colspan="4">No threat category tags assigned</td></tr>
+    {% endif %}
+</table>
+</div>
+<hr>
+
 {# Warden events #}
 {% if ipinfo.events %}
   <dt>Warden events ({{ipinfo.events_meta.total}})</dt>
@@ -180,6 +201,7 @@ <h1>IP address</h1>
 {% endif %}
 
 {# MISP events #}
+{# TODO: filter "showable" events in backend! #}
 {% if ipinfo.misp_events %}
   {% set misp_events = namespace(showable=0) -%}
   {% for misp_event in ipinfo.misp_events -%}
@@ -187,10 +209,10 @@ <h1>IP address</h1>
           {% set misp_events.showable = misp_events.showable + 1 -%}
       {% endif -%}
   {% endfor -%}
-  {% if misp_events.showable %}
+  {% if misp_events.showable > 0 %}
       <dt> MISP events </dt>
       <dd{% if misp_events.showable > 1%} class="scrollable"{% endif %}>
-       {% for misp_event in val|sort(attribute='date', reverse=True) %}
+       {% for misp_event in ipinfo.misp_events|sort(attribute='date', reverse=True) %}
             {% if misp_event.tlp == "white" or (misp_event.tlp == "green" and ac('tlp-green')) %}
                <div><b>[<a href="{{ url_for('misp_event') + misp_event.event_id }}">{{ misp_event.event_id }}</a>] {{ misp_event.pop('date', 'no date') }} | {{ misp_event.pop('info', 'no info') }}</b>
                <table style="padding-left: 30px; padding-bottom: 10px">
@@ -287,23 +309,6 @@ <h1>IP address</h1>
 {% endfor %}
 </dl>
 
-{# Threat category summary #}
-{% if threat_category_table %}
-<p class="caption">Threat category</p>
-<div class="threat_category_detail">
-    <table>
-        <tr><th>Role</th><th>Category</th><th>Details</th><th>Confidence</th></tr>
-        {% for row in threat_category_table %}
-            <tr class="threat_category_tooltip" title="{{ row[4] }}">
-            {% for col in row[:4] %}
-                <td>{{ col|safe }}</td>
-            {% endfor %}
-            </tr>
-        {% endfor %}
-    </table>
-</div>
-{% endif %}
-
 {# Event plot WARDEN -#}
 <p class="caption">Warden event timeline</p>
 <div class="chart-container" style="position: relative; width: 100%; height: 20em">
diff --git a/NERDweb/templates/ips.html b/NERDweb/templates/ips.html
index 39693d60..76d08474 100644
--- a/NERDweb/templates/ips.html
+++ b/NERDweb/templates/ips.html
@@ -319,8 +319,8 @@ <h1>Search IP addresses by ...</h1>
   <th>Device type</th>-->
   <th title="Reputation score (first experimental algorithm - takes into account number of events and number of detectors per day, from last 14 days with linearly decreasing weight by age)">Rep.<sup><a href="https://github.com/CESNET/NERD/wiki/Reputation-score" title="More information about reputation score" target="_blank">(?)</a></sup></th>
   {% if ac('fmp') %}<th title="Future Maliciousness Probability score">FMP</th>{% endif %}
-  <th>Other properties</th>
   <th>Threat category</th>
+  <th>Other properties</th>
   <th>Time added</th>
   <th>Last activity</th>
   <th title="Links to external services">Links</th>
@@ -367,6 +367,23 @@ <h1>Search IP addresses by ...</h1>
     {{ "%.3f"|format(ip.fmp.general) if fmp is defined else "---" }}
   </td>
   {% endif %}
+  <td>
+    <div class="threat_category_preview">
+      {% if ip._threat_category_data_for_tags %}
+        {% for tag in ip._threat_category_data_for_tags %}
+            <table class="threat_tag threat_category_tooltip" style="border-color: {{tag.role_color}}" title="{{ tag.tooltip }}">
+            <tr>
+              <td class="threat_tag_1" style="background-color: {{tag.role_color}}">{{ tag.role|safe }}</td>
+              <td class="threat_tag_2" style="background-color: {{tag.cat_color}}">{{ tag.cat|safe }}</td>
+              {% if tag.subcats -%}
+              <td class="threat_tag_3" style="background-color: {{tag.cat_color}}">{% for subcat in tag.subcats %}{{ subcat|safe }}<br>{% endfor %}</td>
+              {%- endif %}
+            </tr>
+            </table>
+        {% endfor %}
+      {% endif %}
+    </div>
+  </td>
   <td class="other">
     {% if ip.bl -%}
       {% set bl_cnt = ip.bl|selectattr("v")|list|length -%}
@@ -393,7 +410,8 @@ <h1>Search IP addresses by ...</h1>
     {% if ip.open_ntp %}<span class="tag amplifier ntp">Open NTP</span>{% endif %}
     {% if ip.open_snmp %}<span class="tag amplifier snmp">Open SNMP</span>{% endif %}
   #}
-  
+
+  {#
     {% if ip.tags %}
       {% for tag_id,tag_param in ip.tags.items() %}
         {% if tag_id in config_tags and "name" in config_tags[tag_id] %}
@@ -429,27 +447,12 @@ <h1>Search IP addresses by ...</h1>
         {% endif %}
       {% endfor %}
     {% endif %}
-
+  #}
     {% if ip.shodan %}
       {% if ip.shodan.ports %}<span class="tag shodan" title="Ports opened on that IP according to Shodan's InternetDB:<br>{{ip.shodan.ports|join(', ')}}<br>Note, that this information may be more than a week old. Click on Shodan logo to see the latest information on the Shodan web."><a href="https://www.shodan.io/host/{{ ip._id }}" target="_blank"><img src="{{ url_for('static', filename='shodan_icon.png') }}" style="width: 0.9em; height: 0.9em"></a>&nbsp;{{ip.shodan.ports|join_max(5)}}</span>{% endif %}
       {% if ip.shodan.tags %}<span class="tag shodan" title="Tag(s) on Shodan's InternetDB">{{ip.shodan.tags|join(', ')}}</span>{% endif %}
     {% endif %}
   </td>
-  <td>
-    <div class="threat_category_preview">
-      {% if ip._threat_category_table %}
-        <table>
-        {% for row in ip._threat_category_table %}
-            <tr class="threat_category_tooltip" title="{{ row[3] }}">
-            {% for col in row[:3] %}
-                <td>{{ col|safe }}</td>
-            {% endfor %}
-            </tr>
-        {% endfor %}
-        </table>
-      {% endif %}
-    </div>
-  </td>
   <td class="time" {% if ip.ts_added %}data-time={{ ip.ts_added|date_to_int }}{% endif %}>{{ip.ts_added.strftime("%Y-%m-%d %H:%M:%S") if ip.ts_added else "--"}}</td>
   <td {% if ip.last_activity %}class="time" data-time={{ ip.last_activity|date_to_int }}{% endif %}>{{ip.last_activity.strftime("%Y-%m-%d %H:%M:%S") if ip.last_activity else "--"}}</td>
   <td class="links">
diff --git a/common/config.py b/common/config.py
index aff7d800..e79f007f 100644
--- a/common/config.py
+++ b/common/config.py
@@ -21,15 +21,20 @@ def hierarchical_get(self, key, default=NoDefault):
     instead.
     """
     d = self
+    full_key = key
     try:
         while '.' in key:
             first_key, key = key.split('.', 1)
             d = d[first_key]
-        return d[key]
+        result = d[key]
+        if isinstance(result, dict):
+            return HierarchicalDict(result)
+        else:
+            return result
     except (KeyError, TypeError):
         pass # not found - continue below
     if default is NoDefault:
-        raise MissingConfigError("Mandatory configuration element is missing: " + key)
+        raise MissingConfigError("Mandatory configuration element is missing: " + full_key)
     else:
         return default
 
diff --git a/common/threat_categorization.py b/common/threat_categorization.py
index c0fe2c73..f6a4778e 100644
--- a/common/threat_categorization.py
+++ b/common/threat_categorization.py
@@ -47,13 +47,14 @@ def init_warden_receiver(self, event, source):
         self.ip_info = source.get('Note', "")
         self.source_types = source.get('Type', [])
         target_ports = []
-        protocols = source.get('Proto', [])
+        protocols = []
+        for source in event.get('Source', []):
+            protocols += source.get('Proto', [])
         for target in event.get('Target', []):
             target_ports += target.get('Port', [])
             protocols += target.get('Proto', [])
         self.target_ports = [str(port) for port in set(target_ports)]
-        # Protocol list often contains 'tcp'/'udp', but we only want L7 protocol here -> remove the two
-        self.protocols = list(set(protocols) - {'tcp', 'udp'})
+        self.protocols = list(set(protocols) - {'tcp', 'udp'}) # don't include L4 protocols (TCP, UDP), keep just the application layer ones
 
     def init_otx_receiver(self, pulse):
         """
diff --git a/etc/threat_categorization.yml b/etc/threat_categorization.yml
index 9b925cf4..e6155df8 100644
--- a/etc/threat_categorization.yml
+++ b/etc/threat_categorization.yml
@@ -2,6 +2,10 @@
 # Used for malware subcategory classification
 malpedia_family_list_path: "/data/malpedia/malware_families.yml"
 
+# Colors for roles (src/dst) used in tags in web UI
+role_colors:
+  src: "#ffaa66" # light orange
+  dst: "#ff595b" # light red
 
 # Threat categorization
 # Structure:
@@ -10,6 +14,7 @@ malpedia_family_list_path: "/data/malpedia/malware_families.yml"
 #   - role: IP role (src/dst) that will be assigned along with the main category
 #   - label: Displayed name of the category
 #   - description: General description of the category
+#   - color: Color of the category tag in web UI (any CSS-compatible format)
 #   - subcategories: List of required subcategories (port, protocol, malware_family)
 #           Supported subcategories:
 #           - port
@@ -41,16 +46,18 @@ malpedia_family_list_path: "/data/malpedia/malware_families.yml"
 #             - protocols: List of protocols used by the IP
 #             - target_ports: List of target ports used by the IP
 
-threat_categorization:
+threat_categories:
   unknown:
     role: src
-    description: The IP was reported as a source of malicious/unexpected/rouge packets, but without any further specification.
+    description: The IP was reported as a source of malicious/unexpected/rogue packets, but without any further specification.
     label: Unknown
+    color: "#cccccc"
 
   scan:
     role: src
-    description: The IP address performs a common network scanning, i.e. it tries to connect to various targets to search for open ports/services.
+    description: The IP address performs network scanning, i.e. it tries to connect to various targets to search for open ports/services.
     label: Scanning
+    color: "#aaffff"
     subcategories:
       - port
     triggers:
@@ -70,10 +77,11 @@ threat_categorization:
         event.ip_info == 'crowdsecurity/http-probing'
         event.ip_info == 'crowdsecurity/http-sensitive-files'
 
-  bruteforce:
+  login:
     role: src
-    description: The IP performs dictionary (or bruteforce) attacks on password-protected services. Usually accompanied with scanning - searching for the targeted service.
-    label: Bruteforce
+    description: The IP tries to access password-protected services without authorization (e.g. dictionary or bruteforce attacks, login attempts on honeypots).
+    label: Login attempts
+    color: "#55ddaa"
     subcategories:
       - protocol
       - port
@@ -113,6 +121,7 @@ threat_categorization:
     role: src
     description: The IP has been observed as a source of volumetric (D)DoS attacks.
     label: DDoS
+    color: "#c44a48"
     triggers:
       general: |-
         bool(re.findall(r'(?i)http.*flood', event.ip_info + event.description)) -> {'protocol': ['http']}
@@ -133,6 +142,7 @@ threat_categorization:
     role: dst
     description: The IP runs a service which can be (and often is) misused as an amplifier for DDoS attacks, e.g. open DNS resolvers, NTP servers, memcached, etc.
     label: DDoS amplifier
+    color: "#ff9769"
     subcategories:
       - protocol
     triggers:
@@ -153,6 +163,7 @@ threat_categorization:
     role: src
     description: The IP is sending spam.
     label: Spam
+    color: "#2255bb"
     triggers:
       general: |-
         bool(re.findall(r'(?i)send.*spam', event.ip_info + event.description))
@@ -170,8 +181,9 @@ threat_categorization:
 
   malware_distribution:
     role: dst
-    description: The IP is used to distribute a malware, e.g. hosts an HTTP URL from which a malware is being downloaded.
+    description: The IP is used to distribute malware (e.g. hosts an HTTP URL from which a malware is being downloaded).
     label: Malware distribution
+    color: "#dd77ff"
     subcategories:
       - malware_family
     triggers:
@@ -190,8 +202,9 @@ threat_categorization:
 
   cc:
     role: dst
-    description: The IP is used as Command&Control server for a botnet/malware.
+    description: The IP is used as a Command&Control server for a botnet/malware.
     label: Command and control
+    color: "#ff77dd"
     subcategories:
       - malware_family
     triggers:
@@ -218,6 +231,7 @@ threat_categorization:
     role: src
     description: The IP is acting as a bot/drone of a botnet.
     label: Botnet drone
+    color: "#d090d0"
     subcategories:
       - malware_family
     triggers:
@@ -236,6 +250,7 @@ threat_categorization:
     role: dst
     description: The IP is hosting a phishing website.
     label: Phishing site
+    color: "#dddd00"
     triggers:
       general: |-
         bool(re.findall(r'(?i)phishing.*site', event.ip_info + event.description))
@@ -251,6 +266,7 @@ threat_categorization:
     role: src
     description: The IP is attempting to exploit known vulnerabilities.
     label: Exploit
+    color: "#22ee88"
     subcategories:
       - protocol
     triggers:
@@ -270,4 +286,4 @@ threat_categorization:
         bool(re.findall(r'(?i)CVE[-_]20', event.ip_info))
         event.ip_info == 'http-sqli-probing' -> {'protocol': ['http']}
         event.ip_info == 'http-xss-probing' -> {'protocol': ['http']}
-        event.ip_info == 'http-backdoors-attempts' -> {'protocol': ['http']}
\ No newline at end of file
+        event.ip_info == 'http-backdoors-attempts' -> {'protocol': ['http']}
diff --git a/scripts/generate_ip_category_files.py b/scripts/generate_ip_category_files.py
index 57e39361..c64ea5f4 100644
--- a/scripts/generate_ip_category_files.py
+++ b/scripts/generate_ip_category_files.py
@@ -32,7 +32,7 @@
 
 # read categorization config
 config = read_config(args.cfg_file)
-categories = [cat for cat in config.get('threat_categorization')]
+categories = [cat for cat in config.get('threat_categories')]
 categories.remove("unknown")
 
 # bash script used to execute the DB query