-
Notifications
You must be signed in to change notification settings - Fork 8
Expand file tree
/
Copy pathupdate_data.sh
More file actions
executable file
· 330 lines (272 loc) · 9.89 KB
/
update_data.sh
File metadata and controls
executable file
· 330 lines (272 loc) · 9.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
#!/bin/bash
# update_data.sh - Automated Scribe-Data update script for Scribe-Server
# Stop at the first command that fails without an explicit handler.
set -e

# MARK: Config

# Optional first argument; only the literal "true" skips the migration step.
SKIP_MIGRATION="${1:-false}"

# Save project root (captured before the script changes directory).
PROJECT_ROOT="$(pwd)"

# Working locations.
SCRIBE_DATA_DIR='Scribe-Data'
TEMP_DIR='/tmp/scribe-data-update'
LOG_FILE='/tmp/scribe-data-update.log'
PACKS_DIR='./packs/sqlite'
VENV_DIR='./.venv'

# Define target languages and data types.
TARGET_LANGUAGES=(
  english french german italian
  spanish portuguese russian swedish
)
DATA_TYPES=(nouns verbs)

# ANSI colour escapes used by the logging helpers.
NC='\033[0m' # no color
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
# Logging functions.
# Print a timestamped message (timestamp in $BLUE) to stdout and append the
# same line to $LOG_FILE.
# Arguments: $1 - message text; backslash escapes in it are interpreted.
log() {
  printf '%b\n' "${BLUE}[$(date '+%Y-%m-%d %H:%M:%S')]${NC} $1" \
    | tee -a "$LOG_FILE"
}
# Print an "[ERROR]" line (tag in $RED) to stdout and append it to $LOG_FILE.
# Arguments: $1 - message text; backslash escapes in it are interpreted.
error() {
  printf '%b\n' "${RED}[ERROR]${NC} $1" \
    | tee -a "$LOG_FILE"
}
# Print a "[SUCCESS]" line (tag in $GREEN) to stdout and append it to $LOG_FILE.
# Arguments: $1 - message text; backslash escapes in it are interpreted.
success() {
  printf '%b\n' "${GREEN}[SUCCESS]${NC} $1" \
    | tee -a "$LOG_FILE"
}
# Print a "[WARNING]" line (tag in $YELLOW) to stdout and append it to $LOG_FILE.
# Arguments: $1 - message text; backslash escapes in it are interpreted.
warning() {
  printf '%b\n' "${YELLOW}[WARNING]${NC} $1" \
    | tee -a "$LOG_FILE"
}
# Cleanup for temp directory.
# Logs and recursively removes $TEMP_DIR when it exists; does nothing otherwise.
cleanup() {
  [ -d "$TEMP_DIR" ] || return 0
  log "Cleaning up temporary directory: $TEMP_DIR"
  rm -rf "$TEMP_DIR"
}
# Run cleanup whenever the script exits.
trap 'cleanup' EXIT

# MARK: Enter TMP Dir

mkdir -p "$TEMP_DIR"
cd "$TEMP_DIR"

# Announce the run, the working directory and the log location, in that order.
for startup_msg in \
  "π Starting Scribe-Data update process..." \
  "Working directory: $(pwd)" \
  "Log file: $LOG_FILE"; do
  log "$startup_msg"
done
# MARK: Get Scribe-Data

log "π¦ Setting up Scribe-Data repository..."
if [ -d "$SCRIBE_DATA_DIR" ]; then
  # Existing checkout: try to refresh it; a failed pull is only a warning.
  log "Repository exists, updating..."
  cd "$SCRIBE_DATA_DIR"
  if ! git pull origin main; then
    warning "Failed to update repository, continuing with existing version"
  fi
  cd ..
else
  # No checkout yet: a shallow clone is enough, and failure is fatal.
  log "Cloning Scribe-Data repository..."
  if ! git clone --depth=1 https://github.com/scribe-org/Scribe-Data.git "$SCRIBE_DATA_DIR"; then
    error "Failed to clone Scribe-Data repo"
    exit 1
  fi
  success "Repository cloned successfully"
fi

# The remaining steps run from inside the checkout.
cd "$SCRIBE_DATA_DIR"
# MARK: Python / Pip

log "π Checking Python environment..."

# python3 is a hard requirement; there is no fallback.
if ! command -v python3 > /dev/null 2>&1; then
  error "Python3 is not installed. Please install Python3 first."
  exit 1
fi

# Bootstrap pip only when neither `pip` nor `pip3` can be found.
if ! { command -v pip || command -v pip3; } > /dev/null 2>&1; then
  warning "pip not found. Attempting to download and install pip..."

  # Reuse a previously downloaded installer if one is already present.
  if [ ! -f "get-pip.py" ]; then
    log "Downloading get-pip.py from PyPA..."
    if ! curl -sS https://bootstrap.pypa.io/get-pip.py -o get-pip.py; then
      error "Failed to download get-pip.py"
      exit 1
    fi
  fi

  if ! python3 get-pip.py; then
    error "Failed to install pip"
    exit 1
  fi
  success "pip installed successfully"
fi
# MARK: Make Venv

log "π§ͺ Setting up virtual environment..."
if [ -d "$VENV_DIR" ]; then
  log "Using existing virtual environment at $VENV_DIR"
else
  if ! python3 -m venv "$VENV_DIR"; then
    error "Failed to create virtual environment"
    exit 1
  fi
  success "Virtual environment created at $VENV_DIR"
fi

log "π¬ Activating virtual environment..."
# Sourced (not executed) so the activation affects this shell.
if ! source "$VENV_DIR/bin/activate"; then
  error "Failed to activate virtual environment"
  exit 1
fi
success "Virtual environment activated"
# MARK: Dependencies

log "π Installing Scribe-Data dependencies..."

# No explicit handler on the self-upgrade: if it fails, set -e ends the run.
pip install --upgrade pip

# Editable install of the checkout in the current directory.
if ! pip install -e .; then
  error "Failed to install Scribe-Data dependencies"
  exit 1
fi
success "Dependencies installed successfully"
# MARK: Download Wikidata Dump First

# Dump location, relative to the current directory (the Scribe-Data checkout).
DUMP_DIR="./scribe_data_wikidata_dumps_export"
DUMP_FILE="$DUMP_DIR/latest-lexemes.json.bz2"

if [ ! -f "$DUMP_FILE" ]; then
  log "π₯ Downloading Wikidata lexeme dump..."
  # Auto-confirm the download prompt with "y" for the initial confirmation.
  echo "y" | scribe-data download -wdv latest || {
    error "Failed to download Wikidata dump"
    exit 1
  }
  success "Wikidata dump downloaded successfully"
else
  # Keep this message on a single line so it produces exactly one log entry.
  log "✅ Wikidata dump already exists: $DUMP_FILE"
fi
# MARK: Generate Language Data

log "β‘ Generating language data for target languages (auto-confirm)..."

# Space-joined copies of the arrays, used for the log lines below.
LANG_STRING="${TARGET_LANGUAGES[*]}"
DATA_TYPES_STRING="${DATA_TYPES[*]}"
log "Languages: $LANG_STRING"
log "Data types: $DATA_TYPES_STRING"

# Calculate total number of combinations for the responses.
NUM_LANGUAGES=${#TARGET_LANGUAGES[@]}
NUM_DATA_TYPES=${#DATA_TYPES[@]}
TOTAL_COMBINATIONS=$((NUM_LANGUAGES * NUM_DATA_TYPES))
log "Total combinations to process: $TOTAL_COMBINATIONS"
log "Each combination will prompt to 'Use existing latest dump'"
log "Running: scribe-data get -l $LANG_STRING -dt $DATA_TYPES_STRING -wdp $DUMP_DIR"

# Send Down Arrow twice + Enter for each combination.
# This selects the 3rd option "Use existing latest dump".
# The arrays are expanded directly (one argument per element) instead of
# relying on word-splitting of the joined strings, and a failure is reported
# through error() like every other step rather than a silent set -e abort.
{
  for ((i = 1; i <= TOTAL_COMBINATIONS; i++)); do
    printf "\033[B\033[B\n" # Down arrow, Down arrow, Enter
  done
} | scribe-data get -l "${TARGET_LANGUAGES[@]}" -dt "${DATA_TYPES[@]}" -wdp "$DUMP_DIR" || {
  error "Failed to generate language data"
  exit 1
}
success "Language data generated successfully"
# Sanity Check: Verify generated files.
log "π Checking generated data in scribe_data_json_export..."

# Without the export directory there is nothing to verify or convert.
if [ ! -d "scribe_data_json_export" ]; then
  error "scribe_data_json_export directory not found"
  exit 1
fi

# List all generated JSON files organized by language.
for lang in "${TARGET_LANGUAGES[@]}"; do
  if [ -d "scribe_data_json_export/$lang" ]; then
    log "π $lang:"
    # IFS= keeps each path intact; every file yields exactly one log line.
    find "scribe_data_json_export/$lang" -name "*.json" | sort | while IFS= read -r file; do
      filename=$(basename "$file")
      log " ✅ $filename"
    done
  else
    log "β οΈ Missing directory: scribe_data_json_export/$lang"
  fi
done

# Count total files.
total_files=$(find scribe_data_json_export -name "*.json" | wc -l)
expected_files=$TOTAL_COMBINATIONS
log "π Generated $total_files/$expected_files JSON files"

# A shortfall is reported but does not stop the run.
if [ "$total_files" -lt "$expected_files" ]; then
  error "β οΈ Expected $expected_files files but only found $total_files"
  error "Some data may not have been generated successfully"
fi
# MARK: Filter Data

CONTRACTS_DIR="$PROJECT_ROOT/contracts"
log "π Filtering JSON data using contracts..."
if [ ! -d "$CONTRACTS_DIR" ]; then
  warning "Contracts directory not found: $CONTRACTS_DIR"
  warning "Skipping filtering step - proceeding with unfiltered data"
  FILTERED_EXPORT_DIR="./scribe_data_json_export" # use original data
else
  FILTERED_EXPORT_DIR="./scribe_data_json_filtered"
  mkdir -p "$FILTERED_EXPORT_DIR"
  log "Running: scribe-data fd -cd $CONTRACTS_DIR -id scribe_data_json_export -od $FILTERED_EXPORT_DIR"
  scribe-data fd -cd "$CONTRACTS_DIR" -id scribe_data_json_export -od "$FILTERED_EXPORT_DIR" || {
    error "Failed to filter JSON data"
    exit 1
  }
  success "JSON data filtered successfully"

  # Debug: Check filtered files.
  if [ -d "$FILTERED_EXPORT_DIR" ]; then
    filtered_files=$(find "$FILTERED_EXPORT_DIR" -name "*.json" | wc -l)
    log "π Generated $filtered_files filtered JSON files"
  fi
fi

# MARK: Convert Filtered Data to SQLite

# The conversion runs for both branches above: it reads the filtered output,
# or the original export when no contracts directory exists. Keeping it inside
# the else branch would leave the unfiltered path with no SQLite files at all.
# Languages and data types are passed one argument per array element.
log "ποΈ Converting filtered data to SQLite format..."
scribe-data convert -if "$FILTERED_EXPORT_DIR" -lang "${TARGET_LANGUAGES[@]}" -dt "${DATA_TYPES[@]}" -ot sqlite || {
  error "Failed to convert filtered data to SQLite format"
  exit 1
}
success "Filtered data converted to SQLite successfully"
# MARK: Check SQLite

SQLITE_EXPORT_DIR="./scribe_data_sqlite_export"

# The export directory must exist ...
[ -d "$SQLITE_EXPORT_DIR" ] || {
  error "SQLite export directory not found: $SQLITE_EXPORT_DIR"
  exit 1
}

# ... and hold at least one .sqlite file (searched recursively).
SQLITE_FILES=$(find "$SQLITE_EXPORT_DIR" -name "*.sqlite" | wc -l)
[ "$SQLITE_FILES" -ne 0 ] || {
  error "No SQLite files found in $SQLITE_EXPORT_DIR"
  exit 1
}

log "Found $SQLITE_FILES SQLite files to copy"
# MARK: To Packs

# PACKS_DIR is a relative path, so return to the project root before using it.
cd "$PROJECT_ROOT"
mkdir -p "$PACKS_DIR"
log "π Copying SQLite files to server..."
cp -f "$TEMP_DIR/$SCRIBE_DATA_DIR/scribe_data_sqlite_export"/*.sqlite "$PACKS_DIR/" || {
  error "Failed to copy SQLite files to $PACKS_DIR"
  exit 1
}

log "Copied files:"
# One log entry per listing line; the message is kept on a single line.
ls -la "$PACKS_DIR"/*.sqlite | while read -r line; do
  log " ✅ $line"
done
success "SQLite files copied successfully"
# MARK: Migration

# Only the exact value "true" skips the migration; anything else runs it.
if [ "$SKIP_MIGRATION" = "true" ]; then
  log "βοΈ Skipping migration (running in CI/CD)"
else
  log "π Running database migration..."
  if ! make migrate; then
    error "Migration failed"
    exit 1
  fi
  success "Database migration completed successfully"
fi
# MARK: Finish

log "π§Ή Deactivating virtual environment..."
deactivate
success "Virtual environment deactivated"

END_TIME=$(date '+%Y-%m-%d %H:%M:%S')
success "β¨ Scribe-Data update process completed successfully at $END_TIME"

# Report what actually happened to the migration step: it only runs when
# SKIP_MIGRATION is not "true", so the summary must not always say "Completed".
if [ "$SKIP_MIGRATION" != "true" ]; then
  MIGRATION_STATUS="Completed"
else
  MIGRATION_STATUS="Skipped"
fi

log "π Update Summary:"
log " β’ Repository: Updated/Cloned"
log " β’ Virtual Environment: Reused or created at $VENV_DIR"
log " β’ Dependencies: Installed"
log " β’ Languages processed: ${#TARGET_LANGUAGES[@]} (${TARGET_LANGUAGES[*]})"
log " β’ Data types processed: ${#DATA_TYPES[@]}"
log " β’ Total combinations: $TOTAL_COMBINATIONS"
log " β’ Data Generation: Completed"
log " β’ SQLite Conversion: Completed"
log " β’ Files Copied: $SQLITE_FILES files"
log " β’ Migration: $MIGRATION_STATUS"
log " β’ Log file: $LOG_FILE"
# MARK: Export Stats to GitHub Actions

# Only when GITHUB_OUTPUT is non-empty: append the run statistics to that file
# as KEY=value lines.
if [ -n "$GITHUB_OUTPUT" ]; then
  echo "Exporting stats to GitHub Output..."
  {
    printf '%s\n' "LANG_COUNT=${#TARGET_LANGUAGES[@]}"
    printf '%s\n' "LANG_LIST=${TARGET_LANGUAGES[*]}"
    printf '%s\n' "TYPES_COUNT=${#DATA_TYPES[@]}"
    printf '%s\n' "TYPES_LIST=${DATA_TYPES[*]}"
    printf '%s\n' "SQLITE_COUNT=$SQLITE_FILES"
  } >> "$GITHUB_OUTPUT"
fi
echo
# Only claim the MariaDB migration when it actually ran in this invocation.
if [ "$SKIP_MIGRATION" != "true" ]; then
  success "π Scribe-Data has been updated and migrated to MariaDB!"
else
  success "π Scribe-Data has been updated (database migration skipped)!"
fi
echo
log "Next steps:"
log " β’ Restart your server if needed"
log " β’ Test the /data-version/:language_iso endpoints"
log " β’ Check the log file for detailed information: $LOG_FILE"