# Third Party
# instructlab - All of these need to go away (other than sdg) - issue #6
+from datasets import Dataset
from xdg_base_dirs import xdg_data_dirs, xdg_data_home
import openai
-import yaml

# First Party
from instructlab.sdg.blocks.llmblock import DEFAULT_MAX_NUM_TOKENS
    Pipeline,
    PipelineContext,
)
+from instructlab.sdg.taxonomy import taxonomy_to_samples
from instructlab.sdg.utils import GenerateException, models
-from instructlab.sdg.utils.json import jldump
-from instructlab.sdg.utils.taxonomy import (
-    leaf_node_to_samples,
-    read_taxonomy_leaf_nodes,
-)
+from instructlab.sdg.utils.json import jldump, jlload


logger = logging.getLogger(__name__)
@@ -115,20 +112,21 @@ def _gen_train_data(


def _knowledge_seed_example_to_test_data(seed_example, system_prompt):
    res = []
-    for qna in seed_example["questions_and_answers"]:
-        user = qna["question"] + "\n" + seed_example["context"]
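+    # A preprocessed knowledge seed example flattens its three in-context Q&A
+    # pairs into icl_query_1..3 / icl_response_1..3, plus the shared icl_document.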
+    for i in range(3):
+        idx = i + 1
+        user = seed_example[f"icl_query_{idx}"] + "\n" + seed_example["icl_document"]
        res.append(
            {
                "system": system_prompt,
                "user": _unescape(user),
-                "assistant": _unescape(qna["answer"]),
+                "assistant": _unescape(seed_example[f"icl_response_{idx}"]),
            }
        )
    return res


def _gen_test_data(
-    leaf_nodes,
+    seed_examples,
    output_file_test,
    system_prompt,
):
@@ -137,30 +135,29 @@ def _gen_test_data(
    in instructlab/instructlab.
    """
    test_data = []
-    for _, leaf_node in leaf_nodes.items():
-        for seed_example in leaf_node:
-            if "questions_and_answers" in seed_example:
-                test_data.extend(
-                    _knowledge_seed_example_to_test_data(seed_example, system_prompt)
-                )
-                continue
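+    # Seed examples now arrive as one flat list; knowledge examples are detected
+    # by their icl_query_* fields, skill examples by their leaf_node_type.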
+    for seed_example in seed_examples:
+        if "icl_query_1" in seed_example:
+            test_data.extend(
+                _knowledge_seed_example_to_test_data(seed_example, system_prompt)
+            )
+            continue

-            # skill seed example
+        # skill seed example

-            user = seed_example["instruction"]  # question
+        user = seed_example["seed_question"]  # question

-            if len(seed_example["input"]) > 0:
-                user += "\n" + seed_example["input"]  # context
+        if seed_example["leaf_node_type"] == "grounded_skill":
+            user += "\n" + seed_example["seed_context"]  # context

-            test_data.append(
-                {
-                    "system": system_prompt,
-                    "user": _unescape(user),
-                    "assistant": _unescape(seed_example["output"]),  # answer
-                }
-            )
+        test_data.append(
+            {
+                "system": system_prompt,
+                "user": _unescape(user),
+                "assistant": _unescape(seed_example["seed_response"]),  # answer
+            }
+        )

-    jldump(test_data, output_file_test)
+    jldump(test_data, output_file_test)


def _check_pipeline_dir(pipeline):
@@ -208,23 +205,6 @@ def _sdg_init(ctx, pipeline):
    data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")]
    data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs())

-    docling_model_path = None
-    sdg_models_path = docling_model_path
-    for d in data_dirs:
-        if os.path.exists(os.path.join(d, "models")):
-            sdg_models_path = os.path.join(d, "models")
-            break
-
-    if sdg_models_path is not None:
-        try:
-            with open(
-                os.path.join(sdg_models_path, "config.yaml"), "r", encoding="utf-8"
-            ) as file:
-                config = yaml.safe_load(file)
-                docling_model_path = config["models"][0]["path"]
-        except (FileNotFoundError, NotADirectoryError, PermissionError) as e:
-            logger.warning(f"unable to read docling models path from config.yaml {e}")
-
    for d in data_dirs:
        pipeline_path = os.path.join(d, "pipelines", pipeline)
        if os.path.exists(pipeline_path):
@@ -256,7 +236,6 @@ def load_pipeline(yaml_basename):
        load_pipeline("knowledge.yaml"),
        load_pipeline("freeform_skills.yaml"),
        load_pipeline("grounded_skills.yaml"),
-        docling_model_path,
    )

@@ -326,28 +305,32 @@ def generate_data(
    if batch_size is None:
        batch_size = 0

-    if not os.path.exists(output_dir):
-        os.mkdir(output_dir)
-
-    if not (taxonomy and os.path.exists(taxonomy)):
-        raise GenerateException(f"Error: taxonomy ({taxonomy}) does not exist.")
-
+    output_dir = Path(output_dir)
+    output_dir.mkdir(exist_ok=True)
    date_suffix = datetime.now().replace(microsecond=0).isoformat().replace(":", "_")
-    document_output_dir = Path(output_dir) / f"documents-{date_suffix}"
-
-    leaf_nodes = read_taxonomy_leaf_nodes(
-        taxonomy, taxonomy_base, yaml_rules, document_output_dir
+    preprocessed_output_dir = output_dir.joinpath(f"preprocessed_{date_suffix}")
+
+    # This writes samples to disk in our output_dir and returns the
+    # list of files created
+    sample_files = taxonomy_to_samples(
+        taxonomy,
+        preprocessed_output_dir,
+        chunk_word_count=chunk_word_count,
+        server_ctx_size=server_ctx_size,
+        taxonomy_base=taxonomy_base,
+        yaml_rules=yaml_rules,
    )
-    if not leaf_nodes:
-        raise GenerateException("Error: No new leaf nodes found in the taxonomy.")

    name = Path(model_name).stem  # Just in case it is a file path
    output_file_messages = f"messages_{name}_{date_suffix}.jsonl"
    output_file_test = f"test_{name}_{date_suffix}.jsonl"
    output_file_train = f"train_{name}_{date_suffix}.jsonl"

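+    # Test data is built from every seed example, so flatten all of the
+    # preprocessed sample files into a single list up front.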
+    all_samples = []
+    for sample_file in sample_files:
+        all_samples.extend(jlload(sample_file))
    _gen_test_data(
-        leaf_nodes,
+        all_samples,
        os.path.join(output_dir, output_file_test),
        system_prompt,
    )
@@ -368,8 +351,8 @@ def generate_data(
        max_num_tokens=max_num_tokens,
    )

-    knowledge_pipe, freeform_skills_pipe, grounded_skills_pipe, docling_model_path = (
-        _sdg_init(ctx, pipeline)
+    knowledge_pipe, freeform_skills_pipe, grounded_skills_pipe = _sdg_init(
+        ctx, pipeline
    )

    # Make sure checkpointing is disabled (we don't want this pipeline to load checkpoints from the main pipeline)
@@ -390,39 +373,34 @@ def generate_data(
    )

    generated_data = []
-    empty_sdg_leaf_nodes = []
-    for leaf_node in leaf_nodes.values():
-        is_knowledge = False
-        leaf_node_path = leaf_node[0]["taxonomy_path"].replace("->", "_")
-        samples = leaf_node_to_samples(
-            leaf_node,
-            taxonomy,
-            server_ctx_size,
-            chunk_word_count,
-            document_output_dir,
-            model_name,
-            docling_model_path=docling_model_path,
-        )
-
+    empty_input_sample_files = []
+    for sample_file in sample_files:
+        logger.debug("Generating data from input sample file: %s", sample_file)
+        samples = jlload(sample_file)
        if not samples:
-            raise GenerateException("Error: No samples found in leaf node.")
-
-        if "document" in samples.column_names:
+            raise GenerateException(
+                f"Error: No samples found in input file {sample_file}"
+            )
+        # For now we assume every sample in the file is the same type
+        first_sample = samples[0]
+        leaf_node_path = first_sample["leaf_node_path"]
+        leaf_node_type = first_sample["leaf_node_type"]
+        is_knowledge = False
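+        # Route each sample file to the matching pipeline based on its leaf node type.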
+        if leaf_node_type == "knowledge":
            pipe = knowledge_pipe
            is_knowledge = True
-
-        elif "seed_context" in samples.column_names:
+        elif leaf_node_type == "grounded_skill":
            pipe = grounded_skills_pipe
-
        else:
            pipe = freeform_skills_pipe

-        logger.debug("Samples: %s", samples)
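+        # The SDG pipelines operate on a Hugging Face Dataset, so convert the
+        # list of JSONL records before generating.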
+        samples_ds = Dataset.from_list(samples)
+        logger.debug("Samples: %s", samples_ds)

-        new_generated_data = pipe.generate(samples, leaf_node_path)
+        new_generated_data = pipe.generate(samples_ds, leaf_node_path)
        if len(new_generated_data) == 0:
-            empty_sdg_leaf_nodes.append(leaf_node_path)
-            logger.warning("Empty dataset for qna node: %s", leaf_node_path)
+            empty_input_sample_files.append(sample_file)
+            logger.warning("Empty generated dataset for sample file: %s", sample_file)
            continue
        generated_data.append(new_generated_data)
@@ -457,9 +435,9 @@ def generate_data(

    generate_duration = time.time() - generate_start
    logger.info(f"Generation took {generate_duration:.2f}s")
-    if len(empty_sdg_leaf_nodes) > 0:
+    if len(empty_input_sample_files) > 0:
        logger.warning(
-            "Leaf nodes with empty sdg output: {}".format(
-                " ".join(empty_sdg_leaf_nodes)
+            "Input sample files with empty sdg output: {}".format(
+                " ".join(empty_input_sample_files)
            )
        )