-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathmain.py
60 lines (47 loc) · 2.91 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import torch
import pandas as pd
import argparse
from laiser.skill_extractor import Skill_Extractor
# TODO: verify that everything works with the latest version of the library,
# both with and without GPU availability.

# Parse command-line arguments.
parser = argparse.ArgumentParser(description='Run Skill Extractor on jobs and syllabi data.')
parser.add_argument('--HF_TOKEN', type=str, default=None, help='Hugging Face token for authentication')
parser.add_argument('--AI_MODEL_ID', type=str, default=None, help='Model name for Skill Extractor')
parser.add_argument(
    '--use_gpu',
    type=str,
    default=str(torch.cuda.is_available()),
    help='Enable or disable GPU use (true/false, case-insensitive). '
         'Defaults to whether CUDA is available.',
)
parser.add_argument('--batch_size', type=int, default=32, help='Batch size for skills extraction')
args = parser.parse_args()

# Interpret --use_gpu case-insensitively. An exact comparison against "True"
# would silently turn the GPU off for inputs such as "true", "1" or "yes".
_TRUE_FLAGS = ('true', '1', 'yes', 'y', 'on')
_FALSE_FLAGS = ('false', '0', 'no', 'n', 'off')
_gpu_flag = args.use_gpu.strip().lower()
use_gpu = _gpu_flag in _TRUE_FLAGS
if not use_gpu and _gpu_flag not in _FALSE_FLAGS:
    # Unrecognised values still fall back to CPU (the previous behaviour),
    # but the fallback is now reported instead of happening silently.
    print(f'Warning: unrecognised value {args.use_gpu!r} for --use_gpu; GPU use is disabled.')

# Build the extractor from the parsed model id, token and GPU preference.
print('\n\nInitializing the Skill Extractor...')
se = Skill_Extractor(AI_MODEL_ID=args.AI_MODEL_ID, HF_TOKEN=args.HF_TOKEN, use_gpu=use_gpu)
print('The Skill Extractor has been initialized successfully!\n')
# Jobs data: load the sample CSV, trim it, run the extractor and save the result.
jobs_csv_url = 'https://raw.githubusercontent.com/LAiSER-Software/datasets/refs/heads/master/jobs-data/nlx_job_data_50rows.csv'
print('\n\nLoading a sample dataset of 50 jobs...')
jobs_frame = pd.read_csv(jobs_csv_url)
print('The sample jobs dataset has been loaded successfully!\n')

# Keep only the text and identifier columns, then take the [1:3] row slice.
jobs_frame = jobs_frame[['description', 'job_id']][1:3]
print('The sample dataset has been filtered successfully!\n')
print('Head of the sample:\n', jobs_frame.head())

# 'job_id' is passed as the id column and 'description' as the only text column.
jobs_skills = se.extractor(
    jobs_frame,
    'job_id',
    text_columns=['description'],
    batch_size=args.batch_size,
)
print('The skills have been extracted from jobs data successfully...\n')

# Show the extractor output, then write it to a CSV named after the row count.
print(jobs_skills)
jobs_row_count = len(jobs_frame)
jobs_csv_name = f'extracted_skills_for_{jobs_row_count}Jobs.csv'
jobs_skills.to_csv(jobs_csv_name, index=False)
print('The extracted skills have been saved to the file named:', jobs_csv_name)
# Syllabi data: load the sample CSV, trim it, run the extractor and save the result.
syllabi_csv_url = 'https://raw.githubusercontent.com/LAiSER-Software/datasets/refs/heads/master/syllabi-data/preprocessed_50_opensyllabus_syllabi_data.csv'
print('\n\nLoading a sample dataset of 50 syllabi...')
syllabi_frame = pd.read_csv(syllabi_csv_url)
print('The sample syllabi dataset has been loaded successfully!\n')

# Keep the identifier and the two text columns, then take the [1:3] row slice.
syllabi_frame = syllabi_frame[['id', 'description', 'learning_outcomes']][1:3]
print('The sample dataset has been filtered successfully!\n')
print('Head of the sample:\n', syllabi_frame.head())

# 'id' is passed as the id column; both text columns are handed to the extractor
# with input_type='syllabus'.
syllabi_skills = se.extractor(
    syllabi_frame,
    'id',
    text_columns=['description', 'learning_outcomes'],
    input_type='syllabus',
    batch_size=args.batch_size,
)
print('The skills have been extracted from syllabi data successfully...\n')

# Show the extractor output, then write it to a CSV named after the row count.
print(syllabi_skills)
syllabi_row_count = len(syllabi_frame)
syllabi_csv_name = f'extracted_skills_for_{syllabi_row_count}Syllabi.csv'
syllabi_skills.to_csv(syllabi_csv_name, index=False)
print('The extracted skills have been saved to the file named:', syllabi_csv_name)