diff --git a/nbs/00_core.ipynb b/nbs/00_core.ipynb
index 2698824..5757996 100644
--- a/nbs/00_core.ipynb
+++ b/nbs/00_core.ipynb
@@ -193,7 +193,9 @@
     "            elif output_type == \"json\":\n",
     "                datasource[\"dataset\"].to_json(filepath, index=False)\n",
     "            else:\n",
-    "                logger.error(f\"Invalid output_type: {output_type}. Skipping export for {name} dataset.\")\n",
+    "                logger.error(\n",
+    "                    f\"Invalid output_type: {output_type}. Skipping export for {name} dataset.\"\n",
+    "                )\n",
     "            logger.info(f\"Exported {name} dataset to {filepath}\")\n",
     "        except Exception as e:\n",
     "            logger.error(\n",
diff --git a/nbs/01_filter.ipynb b/nbs/01_filter.ipynb
index 0daa40a..4164137 100644
--- a/nbs/01_filter.ipynb
+++ b/nbs/01_filter.ipynb
@@ -895,7 +895,9 @@
     "    threshold: float = 0.85, # The threshold to use for calculating the false positive rate.\n",
     "    column: str = \"content\", # The column to use for calculating the false positive rate.\n",
     "    verbose: bool = False,\n",
-    ") -> Set: # The set of duplicate ids that should be removed, leaving only one id in each community.\n",
+    ") -> (\n",
+    "    Set\n",
+    "): # The set of duplicate ids that should be removed, leaving only one id in each community.\n",
     "    \"\"\"\n",
     "    Find the duplicate communities from the queried dataset.\n",
     "    \"\"\"\n",
diff --git a/nbs/02_clean.ipynb b/nbs/02_clean.ipynb
index 4022fb7..ba8ccde 100644
--- a/nbs/02_clean.ipynb
+++ b/nbs/02_clean.ipynb
@@ -469,6 +469,353 @@
     "assert fix_utf8_encoding(bad_text) == \"PÉREZ\""
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# | export\n",
+    "def clean_code_license(\n",
+    "    code: str, # The code to clean\n",
+    "    language: str = \"python\", # The language of the code\n",
+    "    min_lines: int = 3, # The minimum number of leading comment lines required before they are removed\n",
+    ") -> str: # The code with the leading license/boilerplate comments removed\n",
+    "    \"\"\"Remove the license or other boilerplate comments from the code.\"\"\"\n",
+    "    import code_ast\n",
+    "    from code_ast import ASTVisitor\n",
+    "    from code_ast.ast import LEAVE_WHITELIST\n",
+    "\n",
+    "    class FirstNonCommentVisitor(ASTVisitor):\n",
+    "        def __init__(self):\n",
+    "            self.passed_global_node = False\n",
+    "            self.first_node = None\n",
+    "\n",
+    "        def visit(self, node):\n",
+    "            if not self.passed_global_node:\n",
+    "                self.passed_global_node = True\n",
+    "                return\n",
+    "            if self.first_node is None:\n",
+    "                if node.child_count > 0 or node.type in LEAVE_WHITELIST:\n",
+    "                    self.first_node = node\n",
+    "\n",
+    "    ast = code_ast.ast(code, lang=language)\n",
+    "    visitor = FirstNonCommentVisitor()\n",
+    "    ast.visit(visitor)\n",
+    "    start_line = visitor.first_node.start_point[0]\n",
+    "    if start_line < min_lines:\n",
+    "        return code\n",
+    "    else:\n",
+    "        return \"\\n\".join(code.splitlines()[start_line:])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# | eval: false\n",
+    "# Test the cleaning of code licenses or similar boilerplate comments from code\n",
+    "code_python = \"\"\"# -*- coding: utf-8 -*-\n",
+    "\n",
+    "# Copyright 2018 Spanish National Research Council (CSIC)\n",
+    "#\n",
+    "# Licensed under the Apache License, Version 2.0 (the \"License\"); you may\n",
+    "# not use this file except in compliance with the License. You may obtain\n",
+    "# a copy of the License at\n",
+    "#\n",
+    "# http://www.apache.org/licenses/LICENSE-2.0\n",
+    "#\n",
+    "# Unless required by applicable law or agreed to in writing, software\n",
+    "# distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n",
+    "# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the\n",
+    "# License for the specific language governing permissions and limitations\n",
+    "# under the License.\n",
+    "\n",
+    "\\\"\\\"\\\"\n",
+    "Given two dates and region, download N Sentinel Collections scenes from ESA\n",
+    "Sentinel dataHUB.\n",
+    "The downloaded Sentinel collection scenes are compatible with:\n",
+    "S2MSI1C: Top-of-atmosphere reflectances in cartographic geometry\n",
+    "or S2MSI2A: Bottom-of-atmosphere reflectance in cartographic geometry\n",
+    "Parameters\n",
+    "----------\n",
+    "inidate: datetime.strptime(\"YYYY-MM-dd\", \"%Y-%m-%d\")\n",
+    "enddate: datetime.strptime(\"YYYY-MM-dd\", \"%Y-%m-%d\")\n",
+    "region: name of one reservoir saved in the \"coord_reservoirs.json\" file\n",
+    "coordinates : dict. Coordinates of the region to search.\n",
+    "Example: {\"W\": -2.830, \"S\": 41.820, \"E\": -2.690, \"N\": 41.910}}\n",
+    "platform : str. Satellite to use from the Sentinel family\n",
+    "producttype : str. Dataset type.\n",
+    "cloud: int\n",
+    "path : path\n",
+    "Author: Daniel García Díaz\n",
+    "Email: garciad@ifca.unican.es\n",
+    "Institute of Physics of Cantabria (IFCA)\n",
+    "Advanced Computing and e-Science\n",
+    "Date: Sep 2018\n",
+    "\\\"\\\"\\\"\n",
+    "#imports apis\n",
+    "import requests\n",
+    "import os\n",
+    "\n",
+    "# Subfunctions\n",
+    "from wq_sat.utils import config\n",
+    "\"\"\"\n",
+    "\n",
+    "code_go = \"\"\"// +build go1.9\n",
+    "\n",
+    "// Copyright 2019 Microsoft Corporation\n",
+    "//\n",
+    "// Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+    "// you may not use this file except in compliance with the License.\n",
+    "// You may obtain a copy of the License at\n",
+    "//\n",
+    "// http://www.apache.org/licenses/LICENSE-2.0\n",
+    "//\n",
+    "// Unless required by applicable law or agreed to in writing, software\n",
+    "// distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+    "// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+    "// See the License for the specific language governing permissions and\n",
+    "// limitations under the License.\n",
+    "\n",
+    "// This code was auto-generated by:\n",
+    "// github.com/Azure/azure-sdk-for-go/tools/profileBuilder\n",
+    "\n",
+    "package policyinsights\n",
+    "\n",
+    "import (\n",
+    "\t\"context\"\n",
+    "\n",
+    "\toriginal \"github.com/Azure/azure-sdk-for-go/services/policyinsights/mgmt/2019-10-01/policyinsights\"\n",
+    ")\n",
+    "\"\"\"\n",
+    "\n",
+    "code_c = \"\"\"/*\n",
+    " * copyright (c) 2008 - 2011 Espressif System\n",
+    " *\n",
+    " * Define user specified Event signals and Task priorities here\n",
+    " *\n",
+    " */\n",
+    "\n",
+    "#ifndef _ETS_SYS_H\n",
+    "#define _ETS_SYS_H\n",
+    "\n",
+    "#include \"c_types.h\"\n",
+    "#include \"eagle_soc.h\"\n",
+    "\n",
+    "typedef uint32_t ETSSignal;\n",
+    "\"\"\"\n",
+    "\n",
+    "code_cpp = \"\"\"/* Pokemon Automation Bot Base - Client Example\n",
+    "\n",
+    " * \n",
+    "\n",
+    " * From: https://github.com/PokemonAutomation/Arduino-Source\n",
+    "\n",
+    " * \n",
+    "\n",
+    " */\n",
+    "\n",
+    "\n",
+    "\n",
+    "#include \"Common/CRC32.h\"\n",
+    "\n",
+    "#include \"Common/Microcontroller/MessageProtocol.h\"\n",
+    "\n",
+    "#include \"ClientSource/Libraries/Logging.h\"\n",
+    "\n",
+    "#include \"ClientSource/Libraries/MessageConverter.h\"\n",
+    "\n",
+    "#include \"BotBaseMessage.h\"\n",
+    "\n",
+    "#include \"PABotBaseConnection.h\"\n",
+    "\n",
+    "\n",
+    "\n",
+    "#include <iostream>\n",
+    "\n",
+    "using std::cout;\n",
+    "\"\"\"\n",
+    "\n",
+    "code_java = \"\"\"/*\n",
+    " * Copyright (C) 2012-2021 DuyHai DOAN\n",
+    " *\n",
+    " * Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+    " * you may not use this file except in compliance with the License.\n",
+    " * You may obtain a copy of the License at\n",
+    " *\n",
+    " * http://www.apache.org/licenses/LICENSE-2.0\n",
+    " *\n",
+    " * Unless required by applicable law or agreed to in writing, software\n",
+    " * distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+    " * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+    " * See the License for the specific language governing permissions and\n",
+    " * limitations under the License.\n",
+    " */\n",
+    "\n",
+    "package info.archinnov.achilles.internals.sample_classes.parser.entity;\n",
+    "\n",
+    "import info.archinnov.achilles.annotations.Column;\n",
+    "\"\"\"\n",
+    "\n",
+    "code_javascript = \"\"\"/*\n",
+    "** Copyright (c) 2016-2019, Thomas Farr\n",
+    "**\n",
+    "** This Source Code Form is subject to the terms of the Mozilla Public\n",
+    "** License, v. 2.0. If a copy of the MPL was not distributed with this\n",
+    "** file, You can obtain one at https://mozilla.org/MPL/2.0/.\n",
+    "*/\n",
+    "\n",
+    "// TODO: Implement testing of option handling, and filename arrays\n",
+    "\n",
+    "const anitomy = require('../anitomy');\n",
+    "const async = require('async');\n",
+    "\"\"\"\n",
+    "\n",
+    "cleaned_code_python = clean_code_license(code_python, language=\"python\")\n",
+    "cleaned_code_go = clean_code_license(code_go, language=\"go\")\n",
+    "cleaned_code_c = clean_code_license(code_c, language=\"c\")\n",
+    "cleaned_code_cpp = clean_code_license(code_cpp, language=\"cpp\")\n",
+    "cleaned_code_java = clean_code_license(code_java, language=\"java\")\n",
+    "cleaned_code_javascript = clean_code_license(code_javascript, language=\"javascript\")\n",
+    "\n",
+    "assert (\n",
+    "    cleaned_code_python\n",
+    "    == \"\"\"\\\"\\\"\\\"\n",
+    "Given two dates and region, download N Sentinel Collections scenes from ESA\n",
+    "Sentinel dataHUB.\n",
+    "The downloaded Sentinel collection scenes are compatible with:\n",
+    "S2MSI1C: Top-of-atmosphere reflectances in cartographic geometry\n",
+    "or S2MSI2A: Bottom-of-atmosphere reflectance in cartographic geometry\n",
+    "Parameters\n",
+    "----------\n",
+    "inidate: datetime.strptime(\"YYYY-MM-dd\", \"%Y-%m-%d\")\n",
+    "enddate: datetime.strptime(\"YYYY-MM-dd\", \"%Y-%m-%d\")\n",
+    "region: name of one reservoir saved in the \"coord_reservoirs.json\" file\n",
+    "coordinates : dict. Coordinates of the region to search.\n",
+    "Example: {\"W\": -2.830, \"S\": 41.820, \"E\": -2.690, \"N\": 41.910}}\n",
+    "platform : str. Satellite to use from the Sentinel family\n",
+    "producttype : str. Dataset type.\n",
+    "cloud: int\n",
+    "path : path\n",
+    "Author: Daniel García Díaz\n",
+    "Email: garciad@ifca.unican.es\n",
+    "Institute of Physics of Cantabria (IFCA)\n",
+    "Advanced Computing and e-Science\n",
+    "Date: Sep 2018\n",
+    "\\\"\\\"\\\"\n",
+    "#imports apis\n",
+    "import requests\n",
+    "import os\n",
+    "\n",
+    "# Subfunctions\n",
+    "from wq_sat.utils import config\"\"\"\n",
+    ")\n",
+    "assert (\n",
+    "    cleaned_code_go\n",
+    "    == \"\"\"package policyinsights\n",
+    "\n",
+    "import (\n",
+    "\t\"context\"\n",
+    "\n",
+    "\toriginal \"github.com/Azure/azure-sdk-for-go/services/policyinsights/mgmt/2019-10-01/policyinsights\"\n",
+    ")\"\"\"\n",
+    ")\n",
+    "assert (\n",
+    "    cleaned_code_c\n",
+    "    == \"\"\"#ifndef _ETS_SYS_H\n",
+    "#define _ETS_SYS_H\n",
+    "\n",
+    "#include \"c_types.h\"\n",
+    "#include \"eagle_soc.h\"\n",
+    "\n",
+    "typedef uint32_t ETSSignal;\"\"\"\n",
+    ")\n",
+    "assert (\n",
+    "    cleaned_code_cpp\n",
+    "    == \"\"\"#include \"Common/CRC32.h\"\n",
+    "\n",
+    "#include \"Common/Microcontroller/MessageProtocol.h\"\n",
+    "\n",
+    "#include \"ClientSource/Libraries/Logging.h\"\n",
+    "\n",
+    "#include \"ClientSource/Libraries/MessageConverter.h\"\n",
+    "\n",
+    "#include \"BotBaseMessage.h\"\n",
+    "\n",
+    "#include \"PABotBaseConnection.h\"\n",
+    "\n",
+    "\n",
+    "\n",
+    "#include <iostream>\n",
+    "\n",
+    "using std::cout;\"\"\"\n",
+    ")\n",
+    "assert (\n",
+    "    cleaned_code_java\n",
+    "    == \"\"\"package info.archinnov.achilles.internals.sample_classes.parser.entity;\n",
+    "\n",
+    "import info.archinnov.achilles.annotations.Column;\"\"\"\n",
+    ")\n",
+    "assert (\n",
+    "    cleaned_code_javascript\n",
+    "    == \"\"\"const anitomy = require('../anitomy');\n",
+    "const async = require('async');\"\"\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# test the nsfw_word.csv generator\n",
+    "import csv\n",
+    "\n",
+    "nsfw_words = [\n",
+    "    'word1',\n",
+    "    'word2',\n",
+    "    'word3',\n",
+    "    'word4',\n",
+    "    'word5',\n",
+    "    'word6'\n",
+    "]\n",
+    "\n",
+    "csv_file_path = 'nsfw_words.csv'\n",
+    "\n",
+    "with open(csv_file_path, 'w', newline='') as file:\n",
+    "    writer = csv.writer(file)\n",
+    "    writer.writerow(['Words'])\n",
+    "    writer.writerows([[word] for word in nsfw_words])\n",
+    "\n",
+    "print(f\"CSV file '{csv_file_path}' generated successfully.\")\n",
+    "\n",
+    "##################\n",
+    "\n",
+    "# test the replace_nsfw function\n",
+    "def replace_nsfw(\n",
+    "    text: str, # The text to replace NSFW words in\n",
+    "    nsfw_words_csv: str, # Path to the CSV file containing the list of NSFW words\n",
+    ") -> str: # The text with NSFW words removed\n",
+    "    \"\"\"Replace NSFW words in the text with an empty string.\"\"\"\n",
+    "    try:\n",
+    "        with open(nsfw_words_csv, 'r') as file:\n",
+    "            reader = csv.reader(file)\n",
+    "            nsfw_words = [row[0] for row in reader][1:] # Exclude the header row\n",
+    "    except (FileNotFoundError, IndexError) as e:\n",
+    "        print(f\"Error! NSFW dictionary could not be found or indexed: {e}\")\n",
+    "        return text\n",
+    "\n",
+    "    for word in nsfw_words:\n",
+    "        pattern = r\"\\b\" + re.escape(word) + r\"\\b\\s*\"\n",
+    "        text = re.sub(pattern, \"\", text, flags=re.IGNORECASE)\n",
+    "\n",
+    "    return text"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
diff --git a/settings.ini b/settings.ini
index a4df10e..8ec35c0 100644
--- a/settings.ini
+++ b/settings.ini
@@ -26,7 +26,7 @@ language = English
 status = 3
 user = CarperAI
 requirements = datasketch==1.5.8 datasets==2.7.1 Faker==15.3.3 fastcore huggingface-hub networkit pydantic rich ftfy scikit-learn
-dev_requirements = BeautifulSoup4 fasttext nbdev scrubadub twine sentencepiece code-tokenize langchain==0.0.212 openai
+dev_requirements = BeautifulSoup4 fasttext nbdev scrubadub twine sentencepiece code-tokenize langchain==0.0.212 openai code-ast
 black_formatting = False
 readme_nb = index.ipynb
 allowed_metadata_keys =
diff --git a/squeakily/_modidx.py b/squeakily/_modidx.py
index fd9e618..0f2dd2b 100644
--- a/squeakily/_modidx.py
+++ b/squeakily/_modidx.py
@@ -5,7 +5,8 @@
             'doc_host': 'https://CarperAI.github.io',
             'git_url': 'https://github.com/CarperAI/squeakily',
             'lib_path': 'squeakily'},
-  'syms': { 'squeakily.clean': { 'squeakily.clean.fix_utf8_encoding': ('clean.html#fix_utf8_encoding', 'squeakily/clean.py'),
+  'syms': { 'squeakily.clean': { 'squeakily.clean.clean_code_license': ('clean.html#clean_code_license', 'squeakily/clean.py'),
+                                 'squeakily.clean.fix_utf8_encoding': ('clean.html#fix_utf8_encoding', 'squeakily/clean.py'),
                                  'squeakily.clean.normalize_punctuation': ('clean.html#normalize_punctuation', 'squeakily/clean.py'),
                                  'squeakily.clean.normalize_whitespace': ('clean.html#normalize_whitespace', 'squeakily/clean.py'),
                                  'squeakily.clean.remove_empty_lines': ('clean.html#remove_empty_lines', 'squeakily/clean.py'),
@@ -15,6 +16,7 @@
                                  'squeakily.clean.replace_ip': ('clean.html#replace_ip', 'squeakily/clean.py'),
                                  'squeakily.clean.replace_phone': ('clean.html#replace_phone', 'squeakily/clean.py'),
                                  'squeakily.clean.replace_ssn': ('clean.html#replace_ssn', 'squeakily/clean.py'),
+                                 'squeakily.clean.replace_nsfw': ('clean.html#replace_nsfw', 'squeakily/clean.py'),
                                  'squeakily.clean.replace_urls': ('clean.html#replace_urls', 'squeakily/clean.py')},
             'squeakily.core': { 'squeakily.core.Pipeline': ('core.html#pipeline', 'squeakily/core.py'),
                                 'squeakily.core.Pipeline.__init__': ('core.html#pipeline.__init__', 'squeakily/core.py'),
diff --git a/squeakily/clean.py b/squeakily/clean.py
index 7eb5ad9..982dcef 100644
--- a/squeakily/clean.py
+++ b/squeakily/clean.py
@@ -3,10 +3,11 @@
 # %% auto 0
 __all__ = ['fake', 'whitespace', 'unicode_punctuation', 'normalize_whitespace', 'normalize_punctuation', 'remove_empty_lines',
            'replace_urls', 'replace_dates', 'replace_email', 'replace_phone', 'replace_ip', 'replace_credit_card',
-           'replace_ssn', 'fix_utf8_encoding']
+           'replace_ssn', 'fix_utf8_encoding', 'clean_code_license', 'replace_nsfw']

 # %% ../nbs/02_clean.ipynb 2
 import re
+import csv

 from faker import Faker
 import ftfy
@@ -169,3 +170,57 @@ def fix_utf8_encoding(
 ) -> str: # The fixed text
     """Fix utf8 text using ftfy."""
     return ftfy.fix_text(text)
+
+# %% ../nbs/02_clean.ipynb 27
+def clean_code_license(
+    code: str, # The code to clean
+    language: str = "python", # The language of the code
+    min_lines: int = 3, # The minimum number of leading comment lines required before they are removed
+) -> str: # The code with the leading license/boilerplate comments removed
+    """Remove the license or other boilerplate comments from the code."""
+    import code_ast
+    from code_ast import ASTVisitor
+    from code_ast.ast import LEAVE_WHITELIST
+
+    class FirstNonCommentVisitor(ASTVisitor):
+        def __init__(self):
+            self.passed_global_node = False
+            self.first_node = None
+
+        def visit(self, node):
+            if not self.passed_global_node:
+                self.passed_global_node = True
+                return
+            if self.first_node is None:
+                if node.child_count > 0 or node.type in LEAVE_WHITELIST:
+                    self.first_node = node
+
+    ast = code_ast.ast(code, lang=language)
+    visitor = FirstNonCommentVisitor()
+    ast.visit(visitor)
+    start_line = visitor.first_node.start_point[0]
+    if start_line < min_lines:
+        return code
+    else:
+        return "\n".join(code.splitlines()[start_line:])
+
+# %% ../nbs/02_clean.ipynb 29
+def replace_nsfw(
+    text: str, # The text to replace NSFW words in
+    nsfw_words_csv: str, # Path to the CSV file containing the list of NSFW words
+) -> str: # The text with NSFW words removed
+    """Replace NSFW words in the text with an empty string."""
+    try:
+        with open(nsfw_words_csv, 'r') as file:
+            reader = csv.reader(file)
+            nsfw_words = [row[0] for row in reader][1:] # Exclude the header row
+    except (FileNotFoundError, IndexError) as e:
+        print(f"Error! NSFW dictionary could not be found or indexed: {e}")
+        return text
+
+    for word in nsfw_words:
+        pattern = r"\b" + re.escape(word) + r"\b\s*"
+        text = re.sub(pattern, "", text, flags=re.IGNORECASE)
+
+    return text
+# %%
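
Usage sketch for the new clean_code_license export (illustration only, not part of the patch). It assumes the code-ast package added to dev_requirements above is installed, along with the tree-sitter grammar it needs for the requested language; the sample snippet and its license header below are hypothetical:

    import textwrap
    from squeakily.clean import clean_code_license

    # A hypothetical module whose first three lines are a license header.
    # textwrap.dedent keeps the string readable here while producing real
    # module-level code for the parser.
    sample = textwrap.dedent("""\
        # Copyright (c) 2023 Example Corp
        # Licensed under the Apache License, Version 2.0
        # See LICENSE for details.

        def add(a, b):
            return a + b
        """)

    # The first non-comment AST node (the def) starts at 0-indexed line 4,
    # which is >= the default min_lines=3, so the header should be stripped
    # and only the function itself printed.
    print(clean_code_license(sample, language="python"))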
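
And a usage sketch for replace_nsfw, matching the CSV layout the test cell above writes (a "Words" header row, then one word per row); the word list here is a hypothetical placeholder:

    import csv
    from squeakily.clean import replace_nsfw

    # Write a tiny word list in the expected format.
    with open("nsfw_words.csv", "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Words"])  # header row, skipped by replace_nsfw
        writer.writerows([["foo"], ["bar"]])

    # Matching is case-insensitive, on word boundaries, and also consumes
    # the whitespace that follows each removed word.
    print(replace_nsfw("Foo said a bar word", "nsfw_words.csv"))
    # -> "said a word"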