Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion nbs/00_core.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,9 @@
" elif output_type == \"json\":\n",
" datasource[\"dataset\"].to_json(filepath, index=False)\n",
" else:\n",
" logger.error(f\"Invalid output_type: {output_type}. Skipping export for {name} dataset.\")\n",
" logger.error(\n",
" f\"Invalid output_type: {output_type}. Skipping export for {name} dataset.\"\n",
" )\n",
" logger.info(f\"Exported {name} dataset to {filepath}\")\n",
" except Exception as e:\n",
" logger.error(\n",
Expand Down
4 changes: 3 additions & 1 deletion nbs/01_filter.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -895,7 +895,9 @@
" threshold: float = 0.85, # The threshold to use for calculating the false positive rate.\n",
" column: str = \"content\", # The column to use for calculating the false positive rate.\n",
" verbose: bool = False,\n",
") -> Set: # The set of duplicate ids that should be removed, leaving only one id in each community.\n",
") -> (\n",
" Set\n",
"): # The set of duplicate ids that should be removed, leaving only one id in each community.\n",
" \"\"\"\n",
" Find the duplicate communities from the queried dataset.\n",
" \"\"\"\n",
Expand Down
347 changes: 347 additions & 0 deletions nbs/02_clean.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -469,6 +469,353 @@
"assert fix_utf8_encoding(bad_text) == \"PÉREZ\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
# | export
def clean_code_license(
    code: str,  # The code to clean
    language: str = "python",  # The language of the code
    min_lines: int = 3,  # The minimum number of lines that need to be removed
):
    """Remove the license or other boilerplate comments from the code.

    Parses `code` with tree-sitter (via `code_ast`), locates the first AST
    node that is not a comment, and drops every line before it — but only
    when that would remove at least `min_lines` lines, so short ordinary
    header comments are left alone.
    """
    import code_ast
    from code_ast import ASTVisitor
    from code_ast.ast import LEAVE_WHITELIST

    class FirstNonCommentVisitor(ASTVisitor):
        """Record the first substantive (non-comment) node below the root."""

        def __init__(self):
            self.passed_global_node = False
            self.first_node = None

        def visit(self, node):
            # The first node visited is the global/root node; skip it so we
            # only consider its children.
            if not self.passed_global_node:
                self.passed_global_node = True
                return
            if self.first_node is None:
                # Comment nodes are childless leaves outside LEAVE_WHITELIST,
                # so the first node with children (or a whitelisted leaf) is
                # the first piece of real code.
                if node.child_count > 0 or node.type in LEAVE_WHITELIST:
                    self.first_node = node

    ast = code_ast.ast(code, lang=language)
    visitor = FirstNonCommentVisitor()
    ast.visit(visitor)
    # Bug fix: an input consisting solely of comments leaves first_node unset;
    # the original raised AttributeError on `.start_point` here.
    if visitor.first_node is None:
        return code
    start_line = visitor.first_node.start_point[0]
    if start_line < min_lines:
        return code
    else:
        return "\n".join(code.splitlines()[start_line:])
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# |eval: false\n",
"# Test the cleaning of code licenses or similar boilerplate comments from code\n",
"code_python = \"\"\"# -*- coding: utf-8 -*-\n",
"\n",
"# Copyright 2018 Spanish National Research Council (CSIC)\n",
"#\n",
"# Licensed under the Apache License, Version 2.0 (the \"License\"); you may\n",
"# not use this file except in compliance with the License. You may obtain\n",
"# a copy of the License at\n",
"#\n",
"# http://www.apache.org/licenses/LICENSE-2.0\n",
"#\n",
"# Unless required by applicable law or agreed to in writing, software\n",
"# distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n",
"# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the\n",
"# License for the specific language governing permissions and limitations\n",
"# under the License.\n",
"\n",
"\\\"\\\"\\\"\n",
"Given two dates and region, download N Sentinel Collections scenes from ESA\n",
"Sentinel dataHUB.\n",
"The downloaded Sentinel collection scenes are compatible with:\n",
"S2MSI1C: Top-of-atmosphere reflectances in cartographic geometry\n",
"or S2MSI2A: Bottom-of-atmosphere reflectance in cartographic geometry\n",
"Parameters\n",
"----------\n",
"inidate: datetime.strptime(\"YYYY-MM-dd\", \"%Y-%m-%d\")\n",
"enddate: datetime.strptime(\"YYYY-MM-dd\", \"%Y-%m-%d\")\n",
"region: name of one reservoir saved in the \"coord_reservoirs.json\" file\n",
"coordinates : dict. Coordinates of the region to search.\n",
"Example: {\"W\": -2.830, \"S\": 41.820, \"E\": -2.690, \"N\": 41.910}}\n",
"platform : str. Satellite to use from the Sentinel family\n",
"producttype : str. Dataset type.\n",
"cloud: int\n",
"path : path\n",
"Author: Daniel García Díaz\n",
"Email: [email protected]\n",
"Institute of Physics of Cantabria (IFCA)\n",
"Advanced Computing and e-Science\n",
"Date: Sep 2018\n",
"\\\"\\\"\\\"\n",
"#imports apis\n",
"import requests\n",
"import os\n",
"\n",
"# Subfunctions\n",
"from wq_sat.utils import config\n",
"\"\"\"\n",
"\n",
"code_go = \"\"\"// +build go1.9\n",
"\n",
"// Copyright 2019 Microsoft Corporation\n",
"//\n",
"// Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"// you may not use this file except in compliance with the License.\n",
"// You may obtain a copy of the License at\n",
"//\n",
"// http://www.apache.org/licenses/LICENSE-2.0\n",
"//\n",
"// Unless required by applicable law or agreed to in writing, software\n",
"// distributed under the License is distributed on an \"AS IS\" BASIS,\n",
"// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"// See the License for the specific language governing permissions and\n",
"// limitations under the License.\n",
"\n",
"// This code was auto-generated by:\n",
"// github.com/Azure/azure-sdk-for-go/tools/profileBuilder\n",
"\n",
"package policyinsights\n",
"\n",
"import (\n",
"\t\"context\"\n",
"\n",
"\toriginal \"github.com/Azure/azure-sdk-for-go/services/policyinsights/mgmt/2019-10-01/policyinsights\"\n",
")\n",
"\"\"\"\n",
"\n",
"code_c = \"\"\"/*\n",
" * copyright (c) 2008 - 2011 Espressif System\n",
" *\n",
" * Define user specified Event signals and Task priorities here\n",
" *\n",
" */\n",
"\n",
"#ifndef _ETS_SYS_H\n",
"#define _ETS_SYS_H\n",
"\n",
"#include \"c_types.h\"\n",
"#include \"eagle_soc.h\"\n",
"\n",
"typedef uint32_t ETSSignal;\n",
"\"\"\"\n",
"\n",
"code_cpp = \"\"\"/* Pokemon Automation Bot Base - Client Example\n",
"\n",
" * \n",
"\n",
" * From: https://github.com/PokemonAutomation/Arduino-Source\n",
"\n",
" * \n",
"\n",
" */\n",
"\n",
"\n",
"\n",
"#include \"Common/CRC32.h\"\n",
"\n",
"#include \"Common/Microcontroller/MessageProtocol.h\"\n",
"\n",
"#include \"ClientSource/Libraries/Logging.h\"\n",
"\n",
"#include \"ClientSource/Libraries/MessageConverter.h\"\n",
"\n",
"#include \"BotBaseMessage.h\"\n",
"\n",
"#include \"PABotBaseConnection.h\"\n",
"\n",
"\n",
"\n",
"#include \n",
"\n",
"using std::cout;\n",
"\"\"\"\n",
"\n",
"code_java = \"\"\"/*\n",
" * Copyright (C) 2012-2021 DuyHai DOAN\n",
" *\n",
" * Licensed under the Apache License, Version 2.0 (the \"License\");\n",
" * you may not use this file except in compliance with the License.\n",
" * You may obtain a copy of the License at\n",
" *\n",
" * http://www.apache.org/licenses/LICENSE-2.0\n",
" *\n",
" * Unless required by applicable law or agreed to in writing, software\n",
" * distributed under the License is distributed on an \"AS IS\" BASIS,\n",
" * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
" * See the License for the specific language governing permissions and\n",
" * limitations under the License.\n",
" */\n",
"\n",
"package info.archinnov.achilles.internals.sample_classes.parser.entity;\n",
"\n",
"import info.archinnov.achilles.annotations.Column;\n",
"\"\"\"\n",
"\n",
"code_javascript = \"\"\"/*\n",
"** Copyright (c) 2016-2019, Thomas Farr\n",
"**\n",
"** This Source Code Form is subject to the terms of the Mozilla Public\n",
"** License, v. 2.0. If a copy of the MPL was not distributed with this\n",
"** file, You can obtain one at https://mozilla.org/MPL/2.0/.\n",
"*/\n",
"\n",
"// TODO: Implement testing of option handling, and filename arrays\n",
"\n",
"const anitomy = require('../anitomy');\n",
"const async = require('async');\n",
"\"\"\"\n",
"\n",
"cleaned_code_python = clean_code_license(code_python, language=\"python\")\n",
"cleaned_code_go = clean_code_license(code_go, language=\"go\")\n",
"cleaned_code_c = clean_code_license(code_c, language=\"c\")\n",
"cleaned_code_cpp = clean_code_license(code_cpp, language=\"cpp\")\n",
"cleaned_code_java = clean_code_license(code_java, language=\"java\")\n",
"cleaned_code_javascript = clean_code_license(code_javascript, language=\"javascript\")\n",
"\n",
"assert (\n",
" cleaned_code_python\n",
" == \"\"\"\\\"\\\"\\\"\n",
"Given two dates and region, download N Sentinel Collections scenes from ESA\n",
"Sentinel dataHUB.\n",
"The downloaded Sentinel collection scenes are compatible with:\n",
"S2MSI1C: Top-of-atmosphere reflectances in cartographic geometry\n",
"or S2MSI2A: Bottom-of-atmosphere reflectance in cartographic geometry\n",
"Parameters\n",
"----------\n",
"inidate: datetime.strptime(\"YYYY-MM-dd\", \"%Y-%m-%d\")\n",
"enddate: datetime.strptime(\"YYYY-MM-dd\", \"%Y-%m-%d\")\n",
"region: name of one reservoir saved in the \"coord_reservoirs.json\" file\n",
"coordinates : dict. Coordinates of the region to search.\n",
"Example: {\"W\": -2.830, \"S\": 41.820, \"E\": -2.690, \"N\": 41.910}}\n",
"platform : str. Satellite to use from the Sentinel family\n",
"producttype : str. Dataset type.\n",
"cloud: int\n",
"path : path\n",
"Author: Daniel García Díaz\n",
"Email: [email protected]\n",
"Institute of Physics of Cantabria (IFCA)\n",
"Advanced Computing and e-Science\n",
"Date: Sep 2018\n",
"\\\"\\\"\\\"\n",
"#imports apis\n",
"import requests\n",
"import os\n",
"\n",
"# Subfunctions\n",
"from wq_sat.utils import config\"\"\"\n",
")\n",
"assert (\n",
" cleaned_code_go\n",
" == \"\"\"package policyinsights\n",
"\n",
"import (\n",
"\t\"context\"\n",
"\n",
"\toriginal \"github.com/Azure/azure-sdk-for-go/services/policyinsights/mgmt/2019-10-01/policyinsights\"\n",
")\"\"\"\n",
")\n",
"assert (\n",
" cleaned_code_c\n",
" == \"\"\"#ifndef _ETS_SYS_H\n",
"#define _ETS_SYS_H\n",
"\n",
"#include \"c_types.h\"\n",
"#include \"eagle_soc.h\"\n",
"\n",
"typedef uint32_t ETSSignal;\"\"\"\n",
")\n",
"assert (\n",
" cleaned_code_cpp\n",
" == \"\"\"#include \"Common/CRC32.h\"\n",
"\n",
"#include \"Common/Microcontroller/MessageProtocol.h\"\n",
"\n",
"#include \"ClientSource/Libraries/Logging.h\"\n",
"\n",
"#include \"ClientSource/Libraries/MessageConverter.h\"\n",
"\n",
"#include \"BotBaseMessage.h\"\n",
"\n",
"#include \"PABotBaseConnection.h\"\n",
"\n",
"\n",
"\n",
"#include \n",
"\n",
"using std::cout;\"\"\"\n",
")\n",
"assert (\n",
" cleaned_code_java\n",
" == \"\"\"package info.archinnov.achilles.internals.sample_classes.parser.entity;\n",
"\n",
"import info.archinnov.achilles.annotations.Column;\"\"\"\n",
")\n",
"assert (\n",
" cleaned_code_javascript\n",
" == \"\"\"const anitomy = require('../anitomy');\n",
"const async = require('async');\"\"\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
# test the nsfw_word.csv generator
import csv

NSFW_WORDS = ["word1", "word2", "word3", "word4", "word5", "word6"]

csv_file_path = "nsfw_words.csv"

# Write a one-column CSV: a 'Words' header row followed by one word per row.
with open(csv_file_path, "w", newline="") as out_file:
    csv.writer(out_file).writerows([["Words"], *([word] for word in NSFW_WORDS)])

print(f"CSV file '{csv_file_path}' generated successfully.")

##################

# test the replace_nsfw function
def replace_nsfw(
    text: str,  # The text to replace NSFW words in
    nsfw_words_csv: str,  # Path to the CSV file containing list of NSFW words
) -> str:  # The text with NSFW words removed
    """Replace NSFW words from text with an empty string.

    Reads the word list from `nsfw_words_csv` (one word per row, header row
    excluded) and strips each word from `text` case-insensitively, matching
    on word boundaries only. Returns `text` unchanged if the word list
    cannot be read or parsed.
    """
    # Bug fix: `re` was used below but never imported anywhere in this cell.
    import re

    try:
        with open(nsfw_words_csv, 'r') as file:
            reader = csv.reader(file)
            nsfw_words = [row[0] for row in reader][1:]  # Exclude the header row
    except (FileNotFoundError, IndexError) as e:
        print(f"Error! NSFW dictionary could not be found or indexed: {e}")
        return text

    for word in nsfw_words:
        # \b anchors to word boundaries; the trailing \s* also consumes the
        # gap left behind so a double space does not remain after removal.
        pattern = r"\b" + re.escape(word) + r"\b\s*"
        text = re.sub(pattern, "", text, flags=re.IGNORECASE)

    return text
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down
2 changes: 1 addition & 1 deletion settings.ini
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ language = English
status = 3
user = CarperAI
requirements = datasketch==1.5.8 datasets==2.7.1 Faker==15.3.3 fastcore huggingface-hub networkit pydantic rich ftfy scikit-learn
dev_requirements = BeautifulSoup4 fasttext nbdev scrubadub twine sentencepiece code-tokenize langchain==0.0.212 openai
dev_requirements = BeautifulSoup4 fasttext nbdev scrubadub twine sentencepiece code-tokenize langchain==0.0.212 openai code-ast
black_formatting = False
readme_nb = index.ipynb
allowed_metadata_keys =
Expand Down
Loading