diff --git a/nbs/00_core.ipynb b/nbs/00_core.ipynb
index 2698824..5757996 100644
--- a/nbs/00_core.ipynb
+++ b/nbs/00_core.ipynb
@@ -193,7 +193,9 @@
     "            elif output_type == \"json\":\n",
     "                datasource[\"dataset\"].to_json(filepath, index=False)\n",
     "            else:\n",
-    "                logger.error(f\"Invalid output_type: {output_type}. Skipping export for {name} dataset.\")\n",
+    "                logger.error(\n",
+    "                    f\"Invalid output_type: {output_type}. Skipping export for {name} dataset.\"\n",
+    "                )\n",
     "            logger.info(f\"Exported {name} dataset to {filepath}\")\n",
     "        except Exception as e:\n",
     "            logger.error(\n",
diff --git a/nbs/01_filter.ipynb b/nbs/01_filter.ipynb
index 0daa40a..4164137 100644
--- a/nbs/01_filter.ipynb
+++ b/nbs/01_filter.ipynb
@@ -895,7 +895,9 @@
     "    threshold: float = 0.85, # The threshold to use for calculating the false positive rate.\n",
     "    column: str = \"content\", # The column to use for calculating the false positive rate.\n",
     "    verbose: bool = False,\n",
-    ") -> Set: # The set of duplicate ids that should be removed, leaving only one id in each community.\n",
+    ") -> (\n",
+    "    Set\n",
+    "): # The set of duplicate ids that should be removed, leaving only one id in each community.\n",
     "    \"\"\"\n",
     "    Find the duplicate communities from the queried dataset.\n",
     "    \"\"\"\n",
diff --git a/nbs/02_clean.ipynb b/nbs/02_clean.ipynb
index 4022fb7..ba8ccde 100644
--- a/nbs/02_clean.ipynb
+++ b/nbs/02_clean.ipynb
@@ -469,6 +469,353 @@
     "assert fix_utf8_encoding(bad_text) == \"PÉREZ\""
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# | export\n",
+    "def clean_code_license(\n",
+    "    code: str, # The code to clean\n",
+    "    language: str = \"python\", # The language of the code\n",
+    "    min_lines: int = 3, # The minimum number of leading comment lines required before they are removed\n",
+    ") -> str: # The code with the leading license/boilerplate comments removed\n",
+    "    \"\"\"Remove the license or other boilerplate comments from the code.\"\"\"\n",
+    "    import code_ast\n",
+    "    from code_ast import ASTVisitor\n",
+    "    from code_ast.ast import LEAVE_WHITELIST\n",
+    "\n",
+    "    class FirstNonCommentVisitor(ASTVisitor):\n",
+    "        def __init__(self):\n",
+    "            self.passed_global_node = False\n",
+    "            self.first_node = None\n",
+    "\n",
+    "        def visit(self, node):\n",
+    "            if not self.passed_global_node:\n",
+    "                self.passed_global_node = True\n",
+    "                return\n",
+    "            if self.first_node is None:\n",
+    "                if node.child_count > 0 or node.type in LEAVE_WHITELIST:\n",
+    "                    self.first_node = node\n",
+    "\n",
+    "    ast = code_ast.ast(code, lang=language)\n",
+    "    visitor = FirstNonCommentVisitor()\n",
+    "    ast.visit(visitor)\n",
+    "    start_line = visitor.first_node.start_point[0]\n",
+    "    if start_line < min_lines:\n",
+    "        return code\n",
+    "    else:\n",
+    "        return \"\\n\".join(code.splitlines()[start_line:])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# | eval: false\n",
+    "# Test the cleaning of code licenses or similar boilerplate comments from code\n",
+    "code_python = \"\"\"# -*- coding: utf-8 -*-\n",
+    "\n",
+    "# Copyright 2018 Spanish National Research Council (CSIC)\n",
+    "#\n",
+    "# Licensed under the Apache License, Version 2.0 (the \"License\"); you may\n",
+    "# not use this file except in compliance with the License. You may obtain\n",
+    "# a copy of the License at\n",
+    "#\n",
+    "# http://www.apache.org/licenses/LICENSE-2.0\n",
+    "#\n",
+    "# Unless required by applicable law or agreed to in writing, software\n",
+    "# distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n",
+    "# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the\n",
+    "# License for the specific language governing permissions and limitations\n",
+    "# under the License.\n",
+    "\n",
+    "\\\"\\\"\\\"\n",
+    "Given two dates and region, download N Sentinel Collections scenes from ESA\n",
+    "Sentinel dataHUB.\n",
+    "The downloaded Sentinel collection scenes are compatible with:\n",
+    "S2MSI1C: Top-of-atmosphere reflectances in cartographic geometry\n",
+    "or S2MSI2A: Bottom-of-atmosphere reflectance in cartographic geometry\n",
+    "Parameters\n",
+    "----------\n",
+    "inidate: datetime.strptime(\"YYYY-MM-dd\", \"%Y-%m-%d\")\n",
+    "enddate: datetime.strptime(\"YYYY-MM-dd\", \"%Y-%m-%d\")\n",
+    "region: name of one reservoir saved in the \"coord_reservoirs.json\" file\n",
+    "coordinates : dict. Coordinates of the region to search.\n",
+    "Example: {\"W\": -2.830, \"S\": 41.820, \"E\": -2.690, \"N\": 41.910}}\n",
+    "platform : str. Satellite to use from the Sentinel family\n",
+    "producttype : str. Dataset type.\n",
+    "cloud: int\n",
+    "path : path\n",
+    "Author: Daniel García Díaz\n",
+    "Email: garciad@ifca.unican.es\n",
+    "Institute of Physics of Cantabria (IFCA)\n",
+    "Advanced Computing and e-Science\n",
+    "Date: Sep 2018\n",
+    "\\\"\\\"\\\"\n",
+    "#imports apis\n",
+    "import requests\n",
+    "import os\n",
+    "\n",
+    "# Subfunctions\n",
+    "from wq_sat.utils import config\n",
+    "\"\"\"\n",
+    "\n",
+    "code_go = \"\"\"// +build go1.9\n",
+    "\n",
+    "// Copyright 2019 Microsoft Corporation\n",
+    "//\n",
+    "// Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+    "// you may not use this file except in compliance with the License.\n",
+    "// You may obtain a copy of the License at\n",
+    "//\n",
+    "// http://www.apache.org/licenses/LICENSE-2.0\n",
+    "//\n",
+    "// Unless required by applicable law or agreed to in writing, software\n",
+    "// distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+    "// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+    "// See the License for the specific language governing permissions and\n",
+    "// limitations under the License.\n",
+    "\n",
+    "// This code was auto-generated by:\n",
+    "// github.com/Azure/azure-sdk-for-go/tools/profileBuilder\n",
+    "\n",
+    "package policyinsights\n",
+    "\n",
+    "import (\n",
+    "\t\"context\"\n",
+    "\n",
+    "\toriginal \"github.com/Azure/azure-sdk-for-go/services/policyinsights/mgmt/2019-10-01/policyinsights\"\n",
+    ")\n",
+    "\"\"\"\n",
+    "\n",
+    "code_c = \"\"\"/*\n",
+    " * copyright (c) 2008 - 2011 Espressif System\n",
+    " *\n",
+    " * Define user specified Event signals and Task priorities here\n",
+    " *\n",
+    " */\n",
+    "\n",
+    "#ifndef _ETS_SYS_H\n",
+    "#define _ETS_SYS_H\n",
+    "\n",
+    "#include \"c_types.h\"\n",
+    "#include \"eagle_soc.h\"\n",
+    "\n",
+    "typedef uint32_t ETSSignal;\n",
+    "\"\"\"\n",
+    "\n",
+    "code_cpp = \"\"\"/* Pokemon Automation Bot Base - Client Example\n",
+    "\n",
+    " * \n",
+    "\n",
+    " * From: https://github.com/PokemonAutomation/Arduino-Source\n",
+    "\n",
+    " * \n",
+    "\n",
+    " */\n",
+    "\n",
+    "\n",
+    "\n",
+    "#include \"Common/CRC32.h\"\n",
+    "\n",
+    "#include \"Common/Microcontroller/MessageProtocol.h\"\n",
+    "\n",
+    "#include \"ClientSource/Libraries/Logging.h\"\n",
+    "\n",
+    "#include \"ClientSource/Libraries/MessageConverter.h\"\n",
+    "\n",
+    "#include \"BotBaseMessage.h\"\n",
+    "\n",
+    "#include \"PABotBaseConnection.h\"\n",
+    "\n",
+    "\n",
+    "\n",
+    "#include <iostream>\n",
+    "\n",
+    "using std::cout;\n",
+    "\"\"\"\n",
+    "\n",
+    "code_java = \"\"\"/*\n",
+    " * Copyright (C) 2012-2021 DuyHai DOAN\n",
+    " *\n",
+    " * Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+    " * you may not use this file except in compliance with the License.\n",
+    " * You may obtain a copy of the License at\n",
+    " *\n",
+    " * http://www.apache.org/licenses/LICENSE-2.0\n",
+    " *\n",
+    " * Unless required by applicable law or agreed to in writing, software\n",
+    " * distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+    " * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+    " * See the License for the specific language governing permissions and\n",
+    " * limitations under the License.\n",
+    " */\n",
+    "\n",
+    "package info.archinnov.achilles.internals.sample_classes.parser.entity;\n",
+    "\n",
+    "import info.archinnov.achilles.annotations.Column;\n",
+    "\"\"\"\n",
+    "\n",
+    "code_javascript = \"\"\"/*\n",
+    "** Copyright (c) 2016-2019, Thomas Farr\n",
+    "**\n",
+    "** This Source Code Form is subject to the terms of the Mozilla Public\n",
+    "** License, v. 2.0. If a copy of the MPL was not distributed with this\n",
+    "** file, You can obtain one at https://mozilla.org/MPL/2.0/.\n",
+    "*/\n",
+    "\n",
+    "// TODO: Implement testing of option handling, and filename arrays\n",
+    "\n",
+    "const anitomy = require('../anitomy');\n",
+    "const async = require('async');\n",
+    "\"\"\"\n",
+    "\n",
+    "cleaned_code_python = clean_code_license(code_python, language=\"python\")\n",
+    "cleaned_code_go = clean_code_license(code_go, language=\"go\")\n",
+    "cleaned_code_c = clean_code_license(code_c, language=\"c\")\n",
+    "cleaned_code_cpp = clean_code_license(code_cpp, language=\"cpp\")\n",
+    "cleaned_code_java = clean_code_license(code_java, language=\"java\")\n",
+    "cleaned_code_javascript = clean_code_license(code_javascript, language=\"javascript\")\n",
+    "\n",
+    "assert (\n",
+    "    cleaned_code_python\n",
+    "    == \"\"\"\\\"\\\"\\\"\n",
+    "Given two dates and region, download N Sentinel Collections scenes from ESA\n",
+    "Sentinel dataHUB.\n",
+    "The downloaded Sentinel collection scenes are compatible with:\n",
+    "S2MSI1C: Top-of-atmosphere reflectances in cartographic geometry\n",
+    "or S2MSI2A: Bottom-of-atmosphere reflectance in cartographic geometry\n",
+    "Parameters\n",
+    "----------\n",
+    "inidate: datetime.strptime(\"YYYY-MM-dd\", \"%Y-%m-%d\")\n",
+    "enddate: datetime.strptime(\"YYYY-MM-dd\", \"%Y-%m-%d\")\n",
+    "region: name of one reservoir saved in the \"coord_reservoirs.json\" file\n",
+    "coordinates : dict. Coordinates of the region to search.\n",
+    "Example: {\"W\": -2.830, \"S\": 41.820, \"E\": -2.690, \"N\": 41.910}}\n",
+    "platform : str. Satellite to use from the Sentinel family\n",
+    "producttype : str. Dataset type.\n",
+    "cloud: int\n",
+    "path : path\n",
+    "Author: Daniel García Díaz\n",
+    "Email: garciad@ifca.unican.es\n",
+    "Institute of Physics of Cantabria (IFCA)\n",
+    "Advanced Computing and e-Science\n",
+    "Date: Sep 2018\n",
+    "\\\"\\\"\\\"\n",
+    "#imports apis\n",
+    "import requests\n",
+    "import os\n",
+    "\n",
+    "# Subfunctions\n",
+    "from wq_sat.utils import config\"\"\"\n",
+    ")\n",
+    "assert (\n",
+    "    cleaned_code_go\n",
+    "    == \"\"\"package policyinsights\n",
+    "\n",
+    "import (\n",
+    "\t\"context\"\n",
+    "\n",
+    "\toriginal \"github.com/Azure/azure-sdk-for-go/services/policyinsights/mgmt/2019-10-01/policyinsights\"\n",
+    ")\"\"\"\n",
+    ")\n",
+    "assert (\n",
+    "    cleaned_code_c\n",
+    "    == \"\"\"#ifndef _ETS_SYS_H\n",
+    "#define _ETS_SYS_H\n",
+    "\n",
+    "#include \"c_types.h\"\n",
+    "#include \"eagle_soc.h\"\n",
+    "\n",
+    "typedef uint32_t ETSSignal;\"\"\"\n",
+    ")\n",
+    "assert (\n",
+    "    cleaned_code_cpp\n",
+    "    == \"\"\"#include \"Common/CRC32.h\"\n",
+    "\n",
+    "#include \"Common/Microcontroller/MessageProtocol.h\"\n",
+    "\n",
+    "#include \"ClientSource/Libraries/Logging.h\"\n",
+    "\n",
+    "#include \"ClientSource/Libraries/MessageConverter.h\"\n",
+    "\n",
+    "#include \"BotBaseMessage.h\"\n",
+    "\n",
+    "#include \"PABotBaseConnection.h\"\n",
+    "\n",
+    "\n",
+    "\n",
+    "#include <iostream>\n",
+    "\n",
+    "using std::cout;\"\"\"\n",
+    ")\n",
+    "assert (\n",
+    "    cleaned_code_java\n",
+    "    == \"\"\"package info.archinnov.achilles.internals.sample_classes.parser.entity;\n",
+    "\n",
+    "import info.archinnov.achilles.annotations.Column;\"\"\"\n",
+    ")\n",
+    "assert (\n",
+    "    cleaned_code_javascript\n",
+    "    == \"\"\"const anitomy = require('../anitomy');\n",
+    "const async = require('async');\"\"\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# test the nsfw_word.csv generator\n",
+    "import csv\n",
+    "\n",
+    "nsfw_words = [\n",
+    "    'word1',\n",
+    "    'word2',\n",
+    "    'word3',\n",
+    "    'word4',\n",
+    "    'word5',\n",
+    "    'word6'\n",
+    "]\n",
+    "\n",
+    "csv_file_path = 'nsfw_words.csv'\n",
+    "\n",
+    "with open(csv_file_path, 'w', newline='') as file:\n",
+    "    writer = csv.writer(file)\n",
+    "    writer.writerow(['Words'])\n",
+    "    writer.writerows([[word] for word in nsfw_words])\n",
+    "\n",
+    "print(f\"CSV file '{csv_file_path}' generated successfully.\")\n",
+    "\n",
+    "##################\n",
+    "\n",
+    "# test the replace_nsfw function\n",
+    "def replace_nsfw(\n",
+    "    text: str, # The text to replace NSFW words in\n",
+    "    nsfw_words_csv: str, # Path to the CSV file containing the list of NSFW words\n",
+    ") -> str: # The text with NSFW words removed\n",
+    "    \"\"\"Replace NSFW words in the text with an empty string.\"\"\"\n",
+    "    try:\n",
+    "        with open(nsfw_words_csv, 'r') as file:\n",
+    "            reader = csv.reader(file)\n",
+    "            nsfw_words = [row[0] for row in reader][1:] # Exclude the header row\n",
+    "    except (FileNotFoundError, IndexError) as e:\n",
+    "        print(f\"Error! NSFW dictionary could not be found or indexed: {e}\")\n",
+    "        return text\n",
+    "\n",
+    "    for word in nsfw_words:\n",
+    "        pattern = r\"\\b\" + re.escape(word) + r\"\\b\\s*\"\n",
+    "        text = re.sub(pattern, \"\", text, flags=re.IGNORECASE)\n",
+    "\n",
+    "    return text"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
diff --git a/settings.ini b/settings.ini
index a4df10e..8ec35c0 100644
--- a/settings.ini
+++ b/settings.ini
@@ -26,7 +26,7 @@ language = English
 status = 3
 user = CarperAI
 requirements = datasketch==1.5.8 datasets==2.7.1 Faker==15.3.3 fastcore huggingface-hub networkit pydantic rich ftfy scikit-learn
-dev_requirements = BeautifulSoup4 fasttext nbdev scrubadub twine sentencepiece code-tokenize langchain==0.0.212 openai
+dev_requirements = BeautifulSoup4 fasttext nbdev scrubadub twine sentencepiece code-tokenize langchain==0.0.212 openai code-ast
 black_formatting = False
 readme_nb = index.ipynb
 allowed_metadata_keys =
diff --git a/squeakily/_modidx.py b/squeakily/_modidx.py
index fd9e618..0f2dd2b 100644
--- a/squeakily/_modidx.py
+++ b/squeakily/_modidx.py
@@ -5,7 +5,8 @@
             'doc_host': 'https://CarperAI.github.io',
             'git_url': 'https://github.com/CarperAI/squeakily',
             'lib_path': 'squeakily'},
-  'syms': { 'squeakily.clean': { 'squeakily.clean.fix_utf8_encoding': ('clean.html#fix_utf8_encoding', 'squeakily/clean.py'),
+  'syms': { 'squeakily.clean': { 'squeakily.clean.clean_code_license': ('clean.html#clean_code_license', 'squeakily/clean.py'),
+                                 'squeakily.clean.fix_utf8_encoding': ('clean.html#fix_utf8_encoding', 'squeakily/clean.py'),
                                  'squeakily.clean.normalize_punctuation': ('clean.html#normalize_punctuation', 'squeakily/clean.py'),
                                  'squeakily.clean.normalize_whitespace': ('clean.html#normalize_whitespace', 'squeakily/clean.py'),
                                  'squeakily.clean.remove_empty_lines': ('clean.html#remove_empty_lines', 'squeakily/clean.py'),
@@ -15,6 +16,7 @@
                                  'squeakily.clean.replace_ip': ('clean.html#replace_ip', 'squeakily/clean.py'),
                                  'squeakily.clean.replace_phone': ('clean.html#replace_phone', 'squeakily/clean.py'),
                                  'squeakily.clean.replace_ssn': ('clean.html#replace_ssn', 'squeakily/clean.py'),
+                                 'squeakily.clean.replace_nsfw': ('clean.html#replace_nsfw', 'squeakily/clean.py'),
                                  'squeakily.clean.replace_urls': ('clean.html#replace_urls', 'squeakily/clean.py')},
             'squeakily.core': { 'squeakily.core.Pipeline': ('core.html#pipeline', 'squeakily/core.py'),
                                 'squeakily.core.Pipeline.__init__': ('core.html#pipeline.__init__', 'squeakily/core.py'),
diff --git a/squeakily/clean.py b/squeakily/clean.py
index 7eb5ad9..982dcef 100644
--- a/squeakily/clean.py
+++ b/squeakily/clean.py
@@ -3,10 +3,11 @@
 # %% auto 0
 __all__ = ['fake', 'whitespace', 'unicode_punctuation', 'normalize_whitespace', 'normalize_punctuation', 'remove_empty_lines',
            'replace_urls', 'replace_dates', 'replace_email', 'replace_phone', 'replace_ip', 'replace_credit_card',
-           'replace_ssn', 'fix_utf8_encoding']
+           'replace_ssn', 'fix_utf8_encoding', 'clean_code_license', 'replace_nsfw']

 # %% ../nbs/02_clean.ipynb 2
 import re
+import csv

 from faker import Faker
 import ftfy
@@ -169,3 +170,57 @@ def fix_utf8_encoding(
 ) -> str: # The fixed text
     """Fix utf8 text using ftfy."""
     return ftfy.fix_text(text)
+
+# %% ../nbs/02_clean.ipynb 27
+def clean_code_license(
+    code: str, # The code to clean
+    language: str = "python", # The language of the code
+    min_lines: int = 3, # The minimum number of leading comment lines required before they are removed
+) -> str: # The code with the leading license/boilerplate comments removed
+    """Remove the license or other boilerplate comments from the code."""
+    import code_ast
+    from code_ast import ASTVisitor
+    from code_ast.ast import LEAVE_WHITELIST
+
+    class FirstNonCommentVisitor(ASTVisitor):
+        def __init__(self):
+            self.passed_global_node = False
+            self.first_node = None
+
+        def visit(self, node):
+            if not self.passed_global_node:
+                self.passed_global_node = True
+                return
+            if self.first_node is None:
+                if node.child_count > 0 or node.type in LEAVE_WHITELIST:
+                    self.first_node = node
+
+    ast = code_ast.ast(code, lang=language)
+    visitor = FirstNonCommentVisitor()
+    ast.visit(visitor)
+    start_line = visitor.first_node.start_point[0]
+    if start_line < min_lines:
+        return code
+    else:
+        return "\n".join(code.splitlines()[start_line:])
+
+# %% ../nbs/02_clean.ipynb 29
+def replace_nsfw(
+    text: str, # The text to replace NSFW words in
+    nsfw_words_csv: str, # Path to the CSV file containing the list of NSFW words
+) -> str: # The text with NSFW words removed
+    """Replace NSFW words in the text with an empty string."""
+    try:
+        with open(nsfw_words_csv, 'r') as file:
+            reader = csv.reader(file)
+            nsfw_words = [row[0] for row in reader][1:] # Exclude the header row
+    except (FileNotFoundError, IndexError) as e:
+        print(f"Error! NSFW dictionary could not be found or indexed: {e}")
+        return text
+
+    for word in nsfw_words:
+        pattern = r"\b" + re.escape(word) + r"\b\s*"
+        text = re.sub(pattern, "", text, flags=re.IGNORECASE)
+
+    return text
+# %%
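
Usage sketch for the new clean_code_license export (illustration only, not part of the patch). It assumes the code-ast package added to dev_requirements above is installed, along with the tree-sitter grammar it needs for the requested language; the sample snippet and its license header below are hypothetical:

    import textwrap
    from squeakily.clean import clean_code_license

    # A hypothetical module whose first three lines are a license header.
    # textwrap.dedent keeps the string readable here while producing real
    # module-level code for the parser.
    sample = textwrap.dedent("""\
        # Copyright (c) 2023 Example Corp
        # Licensed under the Apache License, Version 2.0
        # See LICENSE for details.

        def add(a, b):
            return a + b
        """)

    # The first non-comment AST node (the def) starts at 0-indexed line 4,
    # which is >= the default min_lines=3, so the header should be stripped
    # and only the function itself printed.
    print(clean_code_license(sample, language="python"))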
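
And a usage sketch for replace_nsfw, matching the CSV layout the test cell above writes (a "Words" header row, then one word per row); the word list here is a hypothetical placeholder:

    import csv
    from squeakily.clean import replace_nsfw

    # Write a tiny word list in the expected format.
    with open("nsfw_words.csv", "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Words"])  # header row, skipped by replace_nsfw
        writer.writerows([["foo"], ["bar"]])

    # Matching is case-insensitive, on word boundaries, and also consumes
    # the whitespace that follows each removed word.
    print(replace_nsfw("Foo said a bar word", "nsfw_words.csv"))
    # -> "said a word"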