Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
__pycache__/
*.py[cod]

data/

*.json
result/

embeddings.npy
Empty file.
25 changes: 25 additions & 0 deletions initializing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from src.data_preparation import create_embeddings
from src.utils.file_io import load_data
import argparse
import numpy as np
import os

from src.CONSTANT import DEFAULT_JSON_PATH

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--json_path", type=str, default=DEFAULT_JSON_PATH)
args = parser.parse_args()

parent_path = directory_path = os.path.dirname(args.json_path)
if not os.path.exists(parent_path):
os.makedirs(parent_path)
else:
print("data 폴더가 이미 존재함.")

data = load_data(args.json_path)
annotations = [item['annotation'] for item in data]

embeddings, model = create_embeddings(annotations)
np.save('embeddings.npy', embeddings)

5 changes: 3 additions & 2 deletions src/main.py → main.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import json
import numpy as np
from data_preparation import load_data
from search_system import create_faiss_index, search_and_save_results
from src.utils.file_io import load_data, rePath
from src.search_system import create_faiss_index, search_and_save_results
from sentence_transformers import SentenceTransformer

if __name__ == "__main__":
data = load_data('data/raw_annotation.json')
data = rePath(data, "/content/dataset/female/", "/content/NextStar-Search/data/nextstar_dataset/train_raw")
embeddings = np.load('embeddings.npy')

index = create_faiss_index(embeddings)
Expand Down
2 changes: 2 additions & 0 deletions src/CONSTANT.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Sentence-transformers checkpoint shared by every entry point; a
# multilingual model, since the annotation text includes Korean.
model_name = 'distiluse-base-multilingual-cased-v2'
# Default location of the raw annotation JSON consumed by initializing.py.
DEFAULT_JSON_PATH = './data/raw_annotation.json'
16 changes: 3 additions & 13 deletions src/data_preparation.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,9 @@
from src.CONSTANT import model_name
import json
import numpy as np
from sentence_transformers import SentenceTransformer

def load_data(json_path):
    """Parse the JSON file at *json_path* and return the loaded object.

    NOTE(review): opens with the platform-default text encoding — confirm
    the annotation files are UTF-8/ASCII on every target platform.
    """
    with open(json_path, 'r') as f:
        return json.load(f)

def create_embeddings(annotations):
    """Encode annotation strings into sentence embeddings.

    Parameters:
        annotations: sequence of annotation texts to embed.

    Returns:
        (embeddings, model): the array returned by ``model.encode`` and the
        SentenceTransformer instance, so callers can reuse the loaded model.
    """
    # Single instantiation using the shared constant; the previous body also
    # built a throwaway model with a hard-coded name and discarded it.
    model = SentenceTransformer(model_name)
    embeddings = model.encode(annotations)
    return embeddings, model

if __name__ == "__main__":
data = load_data('data/raw_annotation.json')
annotations = [item['annotation'] for item in data]

embeddings, model = create_embeddings(annotations)
np.save('embeddings.npy', embeddings)
return embeddings, model
21 changes: 21 additions & 0 deletions src/utils/file_io.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import json

def load_data(json_path):
    """Parse the JSON file at *json_path* and return the loaded object.

    Opens the file as UTF-8 explicitly: the platform default encoding
    (e.g. cp1252 on Windows) would corrupt this project's Korean
    annotation text, and the sibling helpers here already use UTF-8.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def load_korean_data(json_path):
    """Read a UTF-8 encoded JSON file and return the parsed object."""
    with open(json_path, encoding='utf-8') as fp:
        payload = json.load(fp)
    return payload

def write_korean_data(json_path, data):
    """Serialize *data* to *json_path* as pretty-printed UTF-8 JSON.

    ensure_ascii=False keeps Korean characters readable in the file
    instead of escaping them to \\uXXXX sequences.
    """
    text = json.dumps(data, indent=4, ensure_ascii=False)
    with open(json_path, 'w', encoding='utf-8') as fp:
        fp.write(text)

def rePath(data_list, path_from, path_to):
    """Rewrite the 'image' path prefix of every annotation record.

    Replaces *path_from* with *path_to* in each record's 'image' value,
    mutating the records in place.

    Parameters:
        data_list: a dict with an 'image' key, or a list of such dicts.
        path_from: substring to replace within each 'image' path.
        path_to: replacement substring.

    Returns:
        The records as a list (a single dict argument is wrapped in one).
    """
    # Bug fix: the old non-list branch wrapped the *string*
    # data_list['image'].replace(...) in a list, so the loop below then
    # raised TypeError on item['image']. Wrap the record dict itself.
    if not isinstance(data_list, list):
        data_list = [data_list]
    for item in data_list:
        item['image'] = item['image'].replace(path_from, path_to)
    return data_list