Scribe-Server/update_data.sh at main · scribe-org/Scribe-Server · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
#!/bin/bash
# update_data.sh - Automated Scribe-Data update script for Scribe-Server

set -e  # abort the run as soon as any unguarded command fails

# MARK: Config

# Optional first argument: pass "true" to skip the database migration step.
SKIP_MIGRATION="${1:-false}"

# Directory the script was started from; later steps cd back here.
PROJECT_ROOT="$(pwd)"

# Name of the Scribe-Data checkout (created inside TEMP_DIR).
SCRIBE_DATA_DIR="Scribe-Data"
# Scratch area for the whole update; removed again by the EXIT trap.
TEMP_DIR="/tmp/scribe-data-update"
# Destination for the generated SQLite packs (used after returning to PROJECT_ROOT).
PACKS_DIR="./packs/sqlite"
# Python virtual environment path (relative; used from inside the checkout).
VENV_DIR="./.venv"
# Every logging helper appends its output to this file.
LOG_FILE="/tmp/scribe-data-update.log"

# Data types and languages requested from Scribe-Data.
DATA_TYPES=("nouns" "verbs")
TARGET_LANGUAGES=(
    "english" "french" "german" "italian"
    "spanish" "portuguese" "russian" "swedish"
)

# ANSI color escapes, stored as literal backslash sequences and expanded
# later by `echo -e` in the logging helpers.
NC='\033[0m' # no color
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
GREEN='\033[0;32m'
RED='\033[0;31m'

# Logging functions.
log() {
    # Prints "$1" behind a blue [YYYY-MM-DD HH:MM:SS] stamp on stdout and
    # appends the same line to $LOG_FILE. `echo -e` also expands backslash
    # escapes that appear in the message itself.
    local message="$1"
    echo -e "${BLUE}[$(date '+%Y-%m-%d %H:%M:%S')]${NC} ${message}" | tee -a "$LOG_FILE"
}
error() {
    # Prints "$1" with a red [ERROR] tag on stdout and appends it to $LOG_FILE.
    # It only reports; callers decide whether to exit.
    local message="$1"
    echo -e "${RED}[ERROR]${NC} ${message}" | tee -a "$LOG_FILE"
}
success() {
    # Prints "$1" with a green [SUCCESS] tag on stdout and appends it to $LOG_FILE.
    local message="$1"
    echo -e "${GREEN}[SUCCESS]${NC} ${message}" | tee -a "$LOG_FILE"
}
warning() {
    # Prints "$1" with a yellow [WARNING] tag on stdout and appends it to $LOG_FILE.
    local message="$1"
    echo -e "${YELLOW}[WARNING]${NC} ${message}" | tee -a "$LOG_FILE"
}

# Cleanup for temp directory.
cleanup() {
    # Removes $TEMP_DIR and everything in it. Does nothing (and succeeds)
    # when the directory does not exist.
    [ -d "$TEMP_DIR" ] || return 0
    log "Cleaning up temporary directory: $TEMP_DIR"
    rm -rf "$TEMP_DIR"
}
# Run the cleanup on every exit path of the script.
trap cleanup EXIT

# MARK: Enter TMP Dir

# Create the scratch directory (no-op if it already exists) and work from
# inside it. Both commands are unguarded, so `set -e` aborts on failure.
mkdir -p "$TEMP_DIR"
cd "$TEMP_DIR"

log "🚀 Starting Scribe-Data update process..."
log "Working directory: $PWD"
log "Log file: $LOG_FILE"

# MARK: Get Scribe-Data

log "📦 Setting up Scribe-Data repository..."
if [ -d "$SCRIBE_DATA_DIR" ]; then
    # A checkout is already present: refresh it. A failed pull is only a
    # warning, so the run continues on the existing copy.
    log "Repository exists, updating..."
    cd "$SCRIBE_DATA_DIR"
    if ! git pull origin main; then
        warning "Failed to update repository, continuing with existing version"
    fi
    cd ..
else
    # No checkout yet: fetch a shallow clone (latest commit only).
    log "Cloning Scribe-Data repository..."
    if ! git clone --depth=1 https://github.com/scribe-org/Scribe-Data.git "$SCRIBE_DATA_DIR"; then
        error "Failed to clone Scribe-Data repo"
        exit 1
    fi
    success "Repository cloned successfully"
fi

# The following steps run from inside the checkout.
cd "$SCRIBE_DATA_DIR"

# MARK: Python / Pip

log "🐍 Checking Python environment..."
# python3 is mandatory; there is no fallback interpreter.
command -v python3 > /dev/null 2>&1 || {
    error "Python3 is not installed. Please install Python3 first."
    exit 1
}

# Bootstrap pip only when neither `pip` nor `pip3` can be found on PATH.
if ! { command -v pip || command -v pip3; } > /dev/null 2>&1; then
    warning "pip not found. Attempting to download and install pip..."
    # Reuse an installer that is already present in the working directory.
    if [ ! -f "get-pip.py" ]; then
        log "Downloading get-pip.py from PyPA..."
        if ! curl -sS https://bootstrap.pypa.io/get-pip.py -o get-pip.py; then
            error "Failed to download get-pip.py"
            exit 1
        fi
    fi
    if ! python3 get-pip.py; then
        error "Failed to install pip"
        exit 1
    fi
    success "pip installed successfully"
fi

# MARK: Make Venv

log "🧪 Setting up virtual environment..."
if [ -d "$VENV_DIR" ]; then
    log "Using existing virtual environment at $VENV_DIR"
else
    # First run in this checkout: create the environment.
    if ! python3 -m venv "$VENV_DIR"; then
        error "Failed to create virtual environment"
        exit 1
    fi
    success "Virtual environment created at $VENV_DIR"
fi

log "🔬 Activating virtual environment..."
# Sourced (not executed) so the activation changes this shell's environment.
if ! source "$VENV_DIR/bin/activate"; then
    error "Failed to activate virtual environment"
    exit 1
fi
success "Virtual environment activated"

# MARK: Dependencies

log "📚 Installing Scribe-Data dependencies..."
# Unguarded on purpose of `set -e`: a failed pip self-upgrade aborts the
# script without a custom error message.
pip install --upgrade pip
# Editable install of the package in the current directory.
if ! pip install -e .; then
    error "Failed to install Scribe-Data dependencies"
    exit 1
fi
success "Dependencies installed successfully"

# MARK: Download Wikidata Dump First

DUMP_DIR="./scribe_data_wikidata_dumps_export"
DUMP_FILE="${DUMP_DIR}/latest-lexemes.json.bz2"

if [ -f "$DUMP_FILE" ]; then
    log "✅ Wikidata dump already exists: $DUMP_FILE"
else
    log "📥 Downloading Wikidata lexeme dump..."
    # A single "y" line on stdin answers the initial download confirmation.
    if ! echo "y" | scribe-data download -wdv latest; then
        error "Failed to download Wikidata dump"
        exit 1
    fi
    success "Wikidata dump downloaded successfully"
fi

# MARK: Generate Language Data

log "⚡ Generating language data for target languages (auto-confirm)..."

# Space-joined copies of the arrays ("${array[*]}" joins the elements with the
# first character of IFS). They are used for log output and are deliberately
# expanded unquoted on the command line so each word becomes one argument.
LANG_STRING="${TARGET_LANGUAGES[*]}"
DATA_TYPES_STRING="${DATA_TYPES[*]}"

log "Languages: $LANG_STRING"
log "Data types: $DATA_TYPES_STRING"

# One interactive prompt is answered per language/data-type pair.
NUM_LANGUAGES=${#TARGET_LANGUAGES[@]}
NUM_DATA_TYPES=${#DATA_TYPES[@]}
TOTAL_COMBINATIONS=$((NUM_LANGUAGES * NUM_DATA_TYPES))

log "Total combinations to process: $TOTAL_COMBINATIONS"
log "Each combination will prompt to 'Use existing latest dump'"
log "Running: scribe-data get -l $LANG_STRING -dt $DATA_TYPES_STRING -wdp $DUMP_DIR"

# Writes $1 menu answers to stdout. Each answer is Down Arrow twice + Enter,
# which selects the 3rd option "Use existing latest dump".
send_dump_menu_answers() {
    local remaining
    for ((remaining = $1; remaining > 0; remaining--)); do
        printf "\033[B\033[B\n"  # Down arrow, Down arrow, Enter
    done
}

# shellcheck disable=SC2086 # word splitting of the joined strings is intended
send_dump_menu_answers "$TOTAL_COMBINATIONS" | scribe-data get -l $LANG_STRING -dt $DATA_TYPES_STRING -wdp "$DUMP_DIR"

success "Language data generated successfully"

# Sanity check: list what was generated and compare the number of JSON files
# with the number of language/data-type combinations.
log "🔍 Checking generated data in scribe_data_json_export..."
if [ ! -d "scribe_data_json_export" ]; then
    error "scribe_data_json_export directory not found"
    exit 1
fi

# Per-language listing of the generated JSON files, sorted by path.
for lang in "${TARGET_LANGUAGES[@]}"; do
    lang_dir="scribe_data_json_export/$lang"
    if [ ! -d "$lang_dir" ]; then
        log "⚠️  Missing directory: scribe_data_json_export/$lang"
        continue
    fi
    log "📁 $lang:"
    find "$lang_dir" -name "*.json" | sort | while read -r json_path; do
        # Strip the directory part; only the file name is logged.
        log "  ✅ ${json_path##*/}"
    done
done

# Overall count across all languages. Too few files is reported but does not
# stop the script.
total_files=$(find scribe_data_json_export -name "*.json" | wc -l)
expected_files=$TOTAL_COMBINATIONS
log "📊 Generated $total_files/$expected_files JSON files"

if [ "$total_files" -lt "$expected_files" ]; then
    error "⚠️  Expected $expected_files files but only found $total_files"
    error "Some data may not have been generated successfully"
fi

# MARK: Filter Data

CONTRACTS_DIR="$PROJECT_ROOT/contracts"
log "🔍 Filtering JSON data using contracts..."

# Decide which JSON tree is converted: the contract-filtered copy when the
# contracts directory exists, otherwise the raw export.
if [ ! -d "$CONTRACTS_DIR" ]; then
    warning "Contracts directory not found: $CONTRACTS_DIR"
    warning "Skipping filtering step - proceeding with unfiltered data"
    FILTERED_EXPORT_DIR="./scribe_data_json_export"  # use original data
    CONVERT_LABEL="unfiltered"
else
    FILTERED_EXPORT_DIR="./scribe_data_json_filtered"
    CONVERT_LABEL="filtered"
    mkdir -p "$FILTERED_EXPORT_DIR"

    log "Running: scribe-data fd -cd $CONTRACTS_DIR -id scribe_data_json_export -od $FILTERED_EXPORT_DIR"
    scribe-data fd -cd "$CONTRACTS_DIR" -id scribe_data_json_export -od "$FILTERED_EXPORT_DIR" || {
        error "Failed to filter JSON data"
        exit 1
    }
    success "JSON data filtered successfully"

    # Debug: Check filtered files.
    if [ -d "$FILTERED_EXPORT_DIR" ]; then
        filtered_files=$(find "$FILTERED_EXPORT_DIR" -name "*.json" | wc -l)
        log "📊 Generated $filtered_files filtered JSON files"
    fi
fi

# MARK: Convert Data to SQLite

# The conversion runs for BOTH branches above. If it only ran after filtering,
# the "unfiltered" fallback would never produce a SQLite export and the
# export-directory check that follows would always abort the script.
# NOTE(review): assumes `scribe-data convert -if` accepts the raw export tree
# the same way it accepts the filtered one - confirm against the CLI docs.
log "🗄️  Converting $CONVERT_LABEL data to SQLite format..."
# shellcheck disable=SC2086 # word splitting of the joined strings is intended
scribe-data convert -if "$FILTERED_EXPORT_DIR" -lang $LANG_STRING -dt $DATA_TYPES_STRING  -ot sqlite || {
    error "Failed to convert $CONVERT_LABEL data to SQLite format"
    exit 1
}
if [ "$CONVERT_LABEL" = "filtered" ]; then
    success "Filtered data converted to SQLite successfully"
else
    success "Unfiltered data converted to SQLite successfully"
fi

# MARK: Check SQLite

SQLITE_EXPORT_DIR="./scribe_data_sqlite_export"
# The conversion step is expected to have created this directory.
[ -d "$SQLITE_EXPORT_DIR" ] || {
    error "SQLite export directory not found: $SQLITE_EXPORT_DIR"
    exit 1
}

# Recursive count of *.sqlite files; an empty export is treated as a failure.
SQLITE_FILES=$(find "$SQLITE_EXPORT_DIR" -name "*.sqlite" | wc -l)
if [ "$SQLITE_FILES" -eq 0 ]; then
    error "No SQLite files found in $SQLITE_EXPORT_DIR"
    exit 1
fi
log "Found $SQLITE_FILES SQLite files to copy"

# MARK: To Packs

# Back to the project so the relative PACKS_DIR resolves against it.
cd "$PROJECT_ROOT"
mkdir -p "$PACKS_DIR"
log "📁 Copying SQLite files to server..."
# Only the top level of the export directory is copied (plain glob, no recursion);
# existing files with the same name are overwritten.
if ! cp -f "$TEMP_DIR/$SCRIBE_DATA_DIR/scribe_data_sqlite_export"/*.sqlite "$PACKS_DIR/"; then
    error "Failed to copy SQLite files to $PACKS_DIR"
    exit 1
fi

# Log a long listing of every *.sqlite file now present in PACKS_DIR.
log "Copied files:"
ls -la "$PACKS_DIR"/*.sqlite | while read -r entry; do
    log "  ✅ $entry"
done
success "SQLite files copied successfully"

# MARK: Migration

# The migration runs unless the first script argument was exactly "true".
if [ "$SKIP_MIGRATION" = "true" ]; then
    log "⏭️ Skipping migration (running in CI/CD)"
else
    log "🔄 Running database migration..."
    if ! make migrate; then
        error "Migration failed"
        exit 1
    fi
    success "Database migration completed successfully"
fi

# MARK: Finish

log "🧹 Deactivating virtual environment..."
# `deactivate` is the shell function provided by the sourced venv activate script.
deactivate
success "Virtual environment deactivated"

END_TIME=$(date '+%Y-%m-%d %H:%M:%S')
success "✨ Scribe-Data update process completed successfully at $END_TIME"

# Prints the migration outcome for the summary: "Skipped" when SKIP_MIGRATION
# is exactly "true" (the same condition the migration step uses), otherwise
# "Completed". Reaching this point without skipping means `make migrate`
# succeeded, because a failed migration exits the script earlier.
migration_status() {
    if [ "$SKIP_MIGRATION" = "true" ]; then
        echo "Skipped"
    else
        echo "Completed"
    fi
}

log "📊 Update Summary:"
log "  • Repository: Updated/Cloned"
log "  • Virtual Environment: Reused or created at $VENV_DIR"
log "  • Dependencies: Installed"
log "  • Languages processed: ${#TARGET_LANGUAGES[@]} (${TARGET_LANGUAGES[*]})"
log "  • Data types processed: ${#DATA_TYPES[@]}"
log "  • Total combinations: $TOTAL_COMBINATIONS"
log "  • Data Generation: Completed"
log "  • SQLite Conversion: Completed"
log "  • Files Copied: $SQLITE_FILES files"
log "  • Migration: $(migration_status)"
log "  • Log file: $LOG_FILE"

# MARK: Export Stats to GitHub Actions

# Only runs when GITHUB_OUTPUT is non-empty (i.e. inside GitHub Actions).
# The notice goes to stdout; the KEY=value lines are appended to the file
# named by GITHUB_OUTPUT.
if [ -n "$GITHUB_OUTPUT" ]; then
    echo "Exporting stats to GitHub Output..."
    {
        echo "LANG_COUNT=${#TARGET_LANGUAGES[@]}"
        echo "LANG_LIST=${TARGET_LANGUAGES[*]}"
        echo "TYPES_COUNT=${#DATA_TYPES[@]}"
        echo "TYPES_LIST=${DATA_TYPES[*]}"
        echo "SQLITE_COUNT=$SQLITE_FILES"
    } >> "$GITHUB_OUTPUT"
fi

echo
# Only claim a MariaDB migration when one actually ran; with
# SKIP_MIGRATION="true" the migration step is bypassed.
if [ "$SKIP_MIGRATION" = "true" ]; then
    success "🎉 Scribe-Data has been updated (database migration skipped)!"
else
    success "🎉 Scribe-Data has been updated and migrated to MariaDB!"
fi
echo
log "Next steps:"
log "  • Restart your server if needed"
log "  • Test the /data-version/:language_iso endpoints"
log "  • Check the log file for detailed information: $LOG_FILE"