Review notes (patch tools skip text that precedes the first file header).

* Line breaks and indentation were restored. The incoming text had every
  whitespace run collapsed to one space, so blank context lines and Python
  indentation are inferred from the hunk line counts and from syntax.
  Confirm them against the repository before applying.
* initializing.py: os.path.dirname() returns '' when --json_path has no
  directory part and os.makedirs('') raises FileNotFoundError, so the
  directory is only created for a non-empty parent. The unused name
  directory_path was dropped.
* src/utils/file_io.py: rePath() replaced a non-list argument with a list
  of strings and then indexed each string with ['image'], which raises
  TypeError. The single record is now wrapped in a list instead.
* main.py (unchanged, please confirm): the rePath() call replaces a prefix
  that ends in "/" with one that does not, which would fuse the directory
  and the file name.
* The hunk sizes of the two edited new files were updated. Their "index"
  blob ids are still those of the original patch.

diff --git a/.gitignore b/.gitignore
index 81d072d..f85a530 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,8 +1,7 @@
 __pycache__/
 *.py[cod]
 
-data/
-
+*.json
 result/
 
 embeddings.npy
\ No newline at end of file
diff --git a/data/__INSERT_JSON_DATA_HERE__.txt b/data/__INSERT_JSON_DATA_HERE__.txt
new file mode 100644
index 0000000..e69de29
diff --git a/initializing.py b/initializing.py
new file mode 100644
index 0000000..42fd0ab
--- /dev/null
+++ b/initializing.py
@@ -0,0 +1,28 @@
+"""Create embeddings for the annotation JSON and save them to embeddings.npy."""
+from src.data_preparation import create_embeddings
+from src.utils.file_io import load_data
+import argparse
+import numpy as np
+import os
+
+from src.CONSTANT import DEFAULT_JSON_PATH
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--json_path", type=str, default=DEFAULT_JSON_PATH)
+    args = parser.parse_args()
+
+    # dirname() is '' when the JSON sits in the working directory, and
+    # os.makedirs('') raises, so only create a non-empty parent.
+    parent_path = os.path.dirname(args.json_path)
+    if parent_path and not os.path.exists(parent_path):
+        os.makedirs(parent_path)
+    else:
+        print("data 폴더가 이미 존재함.")
+
+    data = load_data(args.json_path)
+    annotations = [item['annotation'] for item in data]
+
+    embeddings, model = create_embeddings(annotations)
+    np.save('embeddings.npy', embeddings)
+
diff --git a/src/main.py b/main.py
similarity index 70%
rename from src/main.py
rename to main.py
index 2f726d2..5dace38 100644
--- a/src/main.py
+++ b/main.py
@@ -1,11 +1,12 @@
 import json
 import numpy as np
-from data_preparation import load_data
-from search_system import create_faiss_index, search_and_save_results
+from src.utils.file_io import load_data, rePath
+from src.search_system import create_faiss_index, search_and_save_results
 from sentence_transformers import SentenceTransformer
 
 if __name__ == "__main__":
     data = load_data('data/raw_annotation.json')
+    data = rePath(data, "/content/dataset/female/", "/content/NextStar-Search/data/nextstar_dataset/train_raw")
     embeddings = np.load('embeddings.npy')
 
     index = create_faiss_index(embeddings)
diff --git a/src/CONSTANT.py b/src/CONSTANT.py
new file mode 100644
index 0000000..f982f8d
--- /dev/null
+++ b/src/CONSTANT.py
@@ -0,0 +1,2 @@
+model_name = 'distiluse-base-multilingual-cased-v2'
+DEFAULT_JSON_PATH = './data/raw_annotation.json'
\ No newline at end of file
diff --git a/src/data_preparation.py b/src/data_preparation.py
index 522109f..28d53d9 100644
--- a/src/data_preparation.py
+++ b/src/data_preparation.py
@@ -1,19 +1,9 @@
+from src.CONSTANT import model_name
 import json
 import numpy as np
 from sentence_transformers import SentenceTransformer
 
-def load_data(json_path):
-    with open(json_path, 'r') as f:
-        return json.load(f)
-
 def create_embeddings(annotations):
-    model = SentenceTransformer('distiluse-base-multilingual-cased-v2')
+    model = SentenceTransformer(model_name)
     embeddings = model.encode(annotations)
-    return embeddings, model
-
-if __name__ == "__main__":
-    data = load_data('data/raw_annotation.json')
-    annotations = [item['annotation'] for item in data]
-
-    embeddings, model = create_embeddings(annotations)
-    np.save('embeddings.npy', embeddings)
+    return embeddings, model
\ No newline at end of file
diff --git a/src/utils/file_io.py b/src/utils/file_io.py
new file mode 100644
index 0000000..141beb0
--- /dev/null
+++ b/src/utils/file_io.py
@@ -0,0 +1,31 @@
+import json
+
+def load_data(json_path):
+    """Load and return the JSON document at json_path (default text encoding)."""
+    with open(json_path, 'r') as f:
+        return json.load(f)
+
+def load_korean_data(json_path):
+    """Load and return the JSON document at json_path, decoded as UTF-8."""
+    with open(json_path, "r", encoding='utf-8') as f:
+        return json.load(f)
+
+def write_korean_data(json_path, data):
+    """Write data to json_path as UTF-8 JSON without escaping non-ASCII text."""
+    with open(json_path, 'w', encoding='utf-8') as f:
+        json.dump(data, f, indent=4, ensure_ascii=False)
+
+def rePath(data_list, path_from, path_to):
+    """Replace path_from with path_to in the 'image' value of every record.
+
+    Accepts a list of dicts or a single dict and always returns a list.
+    The records are modified in place.
+    """
+    if not isinstance(data_list, list):
+        # Wrap the lone record itself (not its replaced string) so the loop
+        # below can still index item['image'].
+        data_list = [data_list]
+
+    for item in data_list:
+        item['image'] = item['image'].replace(path_from, path_to)
+    return data_list
\ No newline at end of file