diff --git "a/Week12_\341\204\207\341\205\251\341\206\250\341\204\211\341\205\263\341\206\270\341\204\200\341\205\252\341\204\214\341\205\246_\341\204\200\341\205\265\341\206\267\341\204\216\341\205\242\341\204\213\341\205\247\341\206\274.ipynb" "b/Week12_\341\204\207\341\205\251\341\206\250\341\204\211\341\205\263\341\206\270\341\204\200\341\205\252\341\204\214\341\205\246_\341\204\200\341\205\265\341\206\267\341\204\216\341\205\242\341\204\213\341\205\247\341\206\274.ipynb" new file mode 100644 index 0000000..4eb450f --- /dev/null +++ "b/Week12_\341\204\207\341\205\251\341\206\250\341\204\211\341\205\263\341\206\270\341\204\200\341\205\252\341\204\214\341\205\246_\341\204\200\341\205\265\341\206\267\341\204\216\341\205\242\341\204\213\341\205\247\341\206\274.ipynb" @@ -0,0 +1,2811 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "8Gxy65cu8irm" + }, + "source": [ + "## **Text generation Experiment**\n", + "\n", + "- 이번 복습과제에는 GPT-2 모델을 사용한 텍스트 생성을 다룹니다. 🙂\n", + "- GPT-2는 약 40GB의 인터넷 텍스트 데이터로 훈련된 모델로 다음 단어 예측(next word prediction)을 목적으로 학습이 되었습니다\n", + "- Beam Search, Top-k sampling, Top-p sampling 과 같은 다양한 디코딩 기법들을 실험해보겠습니다."
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "_M2apVV-8cyb" + }, + "outputs": [], + "source": [ + "#reproducability을 위해 해당 코드를 실행해주세요\n", + "SEED = 34\n", + "#max number of words in output text\n", + "MAX_LEN = 70" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "Kd6ZRQmG8gWL" + }, + "outputs": [], + "source": [ + "# 실험할 문장입니다.\n", + "input_sequence = \"I don't know about you, but there's only one thing I want to do after a long day of work\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 447, + "referenced_widgets": [ + "2d4f1219ef25408aa0725c0a49b9ea5e", + "f31d37c927fd425e996c43e725a663ca", + "6b16506a0f8b491a9a0cb64c66773ff6", + "abd181c4d3ed4eca9c5269d75775c833", + "ac9cad3163ca4f929351a83cbe2edeb4", + "5a438f5d61ec44e49989e5612e0c06da", + "81c2fbac07ad44ffb311b6ac66743ed7", + "89fa32bdca4a48e3ba414feedfcb04a2", + "60a4e1d50fd444e1ae57eab72271abca", + "4fd5dff2c25a4480a2174ae1d81034f2", + "73e5adfbe1534c4190c9b8c91a572086", + "2a6b2b09f17f49969b67e4620459bb23", + "c9f766dda0da4c0194c0ddcd0cd61919", + "52a7a4e16916470294f2e7eb1fffdffc", + "efa6f9fc9a3b48b1a420726468e5fd3b", + "5e81b02a5622493199f44d098a34d2d4", + "80ff4590a7ec4a7eafb72dc173e04e48", + "c29b119b2b8343e5b12e80792480ed99", + "57e3af88b7fc4ec094429df84d12122d", + "26f6ef105a964324a84800b82754bd93", + "d700bf4777044b609c302ce42d369e1c", + "f04b4cf625dc47f099f648251146b4bf", + "705d739638ac44818056709a166bdad1", + "c0ab53a39af24f068900f669db515a05", + "3b655421119a4d0e945bc4f5e38dab01", + "03cc758ab39a4a0b822c14cb46f433b2", + "c2f091c1b7bc4a6480388a4c585baf5b", + "5d6fbbc06ce049578d1c300767dfc5c6", + "21e7522ed55a40ef8d29142acc3d069d", + "c0ce81d32c0047968528dc89bbf3580b", + "d233c8c5fb8648a0a63aae44f92657a2", + "7d2890d7ce574771b28b0e595c8fe232", + "09f42b6f43b44205bf9730cd77cda528", + "d89a8e0780544add9746bf1ea2e673d9", + 
"3acec24844b34e09a016e64f26eafcf6", + "b4603ba94d6346c8b9e39f08d038c3ae", + "ab0e43b3c36943a5bd9ccf3f4a3c9ede", + "bbbfdacb13dd40c2aa1d6f3fd4c59ef7", + "e651f62916f24655b722ed452b1062e4", + "198f2dd8e7aa430cb14a09476867c950", + "f95cb57979074d0f91bf769d6d37cd90", + "5eb218303fb4438c9bb64d4440d030ef", + "41aa17ebc83e464c871a506f231eba30", + "a4ac57284b044e0a963a0d6c751e1858", + "03791febac134be9bcd6b378127fb7a5", + "d212d272ab1942a7a875799788da02df", + "3b4a076a5e1041388bf285d9e05061d2", + "df5b37307e2443d0a263966e1f91133b", + "ab66fec2dc584fd1a369bd4aa4c8ee9d", + "d0322248bf514595a619b2dbde524697", + "59148ba271f147599d5ad1a1a3ca8712", + "8d0dc89c154c4fec9d04f5794abd20f3", + "35f506c453dc4f0f9e42dae9d2487c7d", + "216a04bb3a4848dfaad2920b827b4b6e", + "127cffac102a48f9a0fa0ff6f111a5cf", + "cca618e303b34611a77bc54261f9c785", + "56b0c404027b4d8da66794d29a5f7696", + "30e77991e34b462ab4f369c70d6f7ebc", + "1c7e8cd2ced64a65807c95407deb0cb9", + "130853ac235e4a0a95ad94afa92eab44", + "6235c74c67304ad79649955b9bb56815", + "5ae64d63f72e4fa199526bbf5fbf20d6", + "690bd98f6dc9409e87d4466aed4485f6", + "b8646219a56344ef82985203817b6d24", + "09d272d68a564dea9537fa87e20247c1", + "94d886400a414791bf5ee907ff943def" + ] + }, + "id": "pEjO6IVs8gS0", + "outputId": "f99695d5-671d-4dc6-9a04-76e1193d0b83" + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "883036e8c13548dfa9b109e374b969c5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "tokenizer_config.json: 0%| | 0.00/26.0 [00:00 즉, 각 타임스텝 𝑡마다 조건부 확률이 가장 높은 단어를 선택하는 것!\n", + "\n", + "\n", + "- 이 단순한 접근방식이 어떤 성능 차이를 보이는지 살펴봅시다." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ig-oWtIA8gIq", + "outputId": "0ce6a21d-f716-458a-9354-5be570396653" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Output:\n", + "----------------------------------------------------------------------------------------------------\n", + "I don't know about you, but there's only one thing I want to do after a long day of work: go to the gym.\n", + "\n", + "I'm not talking about the gym that's right next to my house. I'm talking about\n" + ] + } + ], + "source": [ + "context = \"I don't know about you, but there's only one thing I want to do after a long day of work\"\n", + "\n", + "# context를 encoder해주세요\n", + "input_ids = tokenizer.encode(context, return_tensors=\"pt\")\n", + "\n", + "# 텍스트 생성하기, 이때 output length가 (context length 포함) 50이 될 때까지\n", + "greedy_output = GPT2.generate(input_ids, max_length=50)\n", + "\n", + "# output sequences 출력하기\n", + "print(\"Output:\\n\" + 100 * '-')\n", + "print(tokenizer.decode(greedy_output[0], skip_special_tokens = True))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gVj1neC__f2N" + }, + "source": [ + "💡**위 Greedy Search 식과 코드 결과를 보고 고려되는 주요 문제점을 해당 셀을 풀고 설명해주세요.**\n", + "\n", + "\n", + "---\n", + "\n", + "\n", + "- 지역 최적화(Local Optimum): 매 스텝 최대 확률만을 따르다 보니, 전체 문맥을 고려한 더 좋은 시퀀스를 놓치기 쉬움.\n", + "- 반복성·단조로움 : 동일한 확률 상위 토큰이 계속 선택되어, 텍스트가 획일적이고 예측 가능해짐." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3EC0shCGAAQq" + }, + "source": [ + "### **Beam Search + N-Gram Penalty**\n", + "- Beam Search는 기본적으로 Greedy Search와 유사하지만, 모델이 각 시점에서 여러 개(num_beams)의 후보 경로를 동시에 추적한다는 점이 다릅니다\n", + " > 즉, 모델이 여러 대안을 비교하면서 텍스트를 생성할 수 있다는 점!\n", + "\n", + "\n", + "- 또한, n-gram 반복을 방지하기 위한 패널티도 적용할 수 있습니다.예를 들어 `no_repeat_ngram_size = 2`로 설정하면\n", + "동일한 2-그램이 두 번 등장하지 않도록 제한됩니다.\n", + "\n", + "- 그리고 `num_return_sequences = 5` 로 설정하면\n", + "5개의 beam 결과를 모두 출력하여 비교해볼 수 있습니다." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "l6OrEzA684Np", + "outputId": "392c9796-8cdb-4c9d-9f5a-32eae3462597" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Output:\n", + "----------------------------------------------------------------------------------------------------\n", + "0: I don't know about you, but there's only one thing I want to do after a long day of work, and that's to go home and watch a movie.\"\n", + "\n", + "\"I don't know about you, but there's only one\n", + "1: I don't know about you, but there's only one thing I want to do after a long day of work, and that's to go home and watch a movie.\"\n", + "\n", + "\"I don't know about you, but I don't want\n", + "2: I don't know about you, but there's only one thing I want to do after a long day of work, and that's to go home and watch a movie.\"\n", + "\n", + "\"I don't know about you, but I want to go\n", + "3: I don't know about you, but there's only one thing I want to do after a long day of work, and that's to go home and watch a movie.\"\n", + "\n", + "\"I don't know about you, but I'd like to\n", + "4: I don't know about you, but there's only one thing I want to do after a long day of work, and that's to go home and watch a movie.\"\n", + "\n", + "\"I don't know about you, but I don't think\n" + ] + } + ], + "source": [ + "# Beam 
Search를 사용하려면,단순히 generate 함수의 몇몇 파라미터만 변경하면 됩니다.\n", + "# num_beams를 설정해서 beam search decoding을 실행해주세요\n", + "beam_outputs = GPT2.generate(\n", + " input_ids, \n", + " max_length=50, \n", + " num_beams=5, \n", + " num_return_sequences=5, \n", + " early_stopping=True\n", + ")\n", + "\n", + "\n", + "print('')\n", + "print(\"Output:\\n\" + 100 * '-')\n", + "\n", + "# output sequences 출력하기\n", + "for i, beam_output in enumerate(beam_outputs):\n", + " print(\"{}: {}\".format(i, tokenizer.decode(beam_output, skip_special_tokens=True)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_VhLZdJlBVZk" + }, + "source": [ + "💡**아래 그래프는 Beam Search의 결과와 실제 인간의 말하기 방식 사이의 차이를 보여줍니다. 위 Beam Search 코드 결과와 아래 그래프를 보고 고려되는 주요 문제점을 해당 셀을 풀고 설명해주세요. (기재된 논문에서 힌트를 찾을 수 있습니다.)**\n", + "\n", + "\n", + "---\n", + "\n", + "\n", + "- Beam Search는 매 timestep마다 현재까지 누적 확률이 높은 상위 K개의 후보 집합만을 유지하며 다음 토큰을 선택하기 때문에, 확률이 낮아 보이지만 뒤쪽 문맥을 고려하면 더 자연스럽게 이어질 수 있는 토큰을 놓칠 가능성이 크다." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aOBGUk2aAwQ-" + }, + "source": [ + "![image.png]()\n", + "\n", + "[출처] The Curious Case of Neural Text Degeneration, arXiv:1904.09751 (cs)\n", + "https://arxiv.org/abs/1904.09751" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BcDagIp1BvFA" + }, + "source": [ + "### **Basic Sampling**\n", + "- 이 방식은 가장 확률이 높은 문장을 찾는 경로를 고집하지 않고, 각 시점에서 조건부 확률 분포에 따라 무작위로 다음 단어를 선택합니다.\n", + "\n", + "$w_t \\sim P(w \\mid w_{1:t-1})$\n", + "- 하지만 이렇게 무작위성이 추가되면, 생성된 문장이 일관성이 떨어지고 혼란스러워질 수 있습니다.\n", + "- 그래서 무작위성을 제어하기 위해 temperature 파라미터를 도입할 수 있습니다. 이 파라미터는 확률이 높은 단어의 선택 가능성을 높이고, 확률이 낮은 단어는 선택될 가능성을 줄여줍니다."
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "z6pXSH2RBuz8", + "outputId": "a4b7f4f2-ca61-4714-a194-2dce3b483ad0" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Output:\n", + "----------------------------------------------------------------------------------------------------\n", + "I don't know about you, but there's only one thing I want to do after a long day of work. I want to take a nice nap.\"\n", + "\n", + "I was a bit surprised. Now I was to have her fall asleep on me\n" + ] + } + ], + "source": [ + "# 샘플링을 구현하려면 do_sample = True만 설정하면 됩니다.\n", + "# temperature을 설정해주세요.\n", + "# 이때 top_k = 0으로 설정해주세요.\n", + "sample_output = GPT2.generate(\n", + " input_ids,\n", + " max_length=50,\n", + " do_sample=True,\n", + " temperature=0.7,\n", + " top_k=0\n", + ")\n", + "\n", + "# output sequences 출력하기\n", + "print(\"Output:\\n\" + 100 * '-')\n", + "print(tokenizer.decode(sample_output[0], skip_special_tokens = True))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8g2RrY7PFmjJ" + }, + "source": [ + "💡**temperature 파라미터가 어떤 매커니즘으로 무작위성을 제어하는지 해당 셀을 풀고 설명해주세요.**\n", + "\n", + "\n", + "---\n", + "\n", + "- Temperature는 로짓(logits)에 대한 스케일(scale)을 조정하여,\n", + "낮게 설정(T<1)하면 확률 분포가 더욱 뾰족(peaky) 해져서, 무작위성이 줄어들고, 높게 설정(T>1)하면 확률 분포가 평탄(flat) 해져서, 무작위성이 증가한다. 따라서 T값을 조정함으로써 샘플링 시의 불확실성을 조절할 수 있다.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RzmrRsA8CmYs" + }, + "source": [ + "### **Top-k Sampling**\n", + "- Top-K 샘플링에서는 다음 단어 후보 중 확률이 가장 높은 상위 K개 단어만 선택하고,\n", + "전체 probability mass을 이 K개의 단어에만 분배합니다.\n", + "\n", + "> 즉, 확률이 높은 단어의 선택 확률을 높이고, 낮은 단어의 확률을 줄이는 방식이 아니라,아예 확률이 낮은 단어들을 완전히 제거하는 방식!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "WA-og6IeD1BZ", + "outputId": "e279dbbc-c4e1-411a-ca49-8c0952000b61" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Output:\n", + "----------------------------------------------------------------------------------------------------\n", + "I don't know about you, but there's only one thing I want to do after a long day of work is to make more money! And in fact that's pretty much what happened just two weeks ago when I made the following calculation.\n", + " ...\n" + ] + } + ], + "source": [ + "# top_k 값을 설정해서, 조건부 확률 분포에서 고려할 상위 단어 개수(K)를 지정해주세요!\n", + "sample_output = GPT2.generate(\n", + " input_ids,\n", + " max_length=50,\n", + " do_sample=True,\n", + " top_k=50\n", + ")\n", + "\n", + "# output sequences 출력하기\n", + "print(\"Output:\\n\" + 100 * '-')\n", + "print(tokenizer.decode(sample_output[0], skip_special_tokens = True), '...')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2CgUegJOAw6h" + }, + "source": [ + "### **Top-P Sampling(Nucleus Sampling)**\n", + "- Top-K 샘플링은 이전의 random sampling보다 더 일관된 텍스트를 생성하는 것으로 보입니다. 
하지만 이보다 더나은 방법으로 Top-p sampling이 있습니다.\n", + "- Top-P 샘플링은 Top-K와 유사하지만,가장 확률이 높은 상위 K개 단어를 고르는 대신,누적 확률이 P 이상이 되는 최소한의 단어 집합을 선택합니다 그리고 전체 probability mass는 이 단어 집합에 재분배됩니다.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "GEhy8PgbAr2f", + "outputId": "aac06a08-21b7-4e78-a231-a4e29709b343" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Output:\n", + "----------------------------------------------------------------------------------------------------\n", + "I don't know about you, but there's only one thing I want to do after a long day of work.\n", + "\n", + "I want to eat a big meal and relax on the couch.\n", + "\n", + "I want to read a good book and just ...\n" + ] + } + ], + "source": [ + "# top_p 파라미터를 통해 only from 80% most likely words 만 sample 해주세요.\n", + "sample_output = GPT2.generate(\n", + " input_ids,\n", + " max_length=50,\n", + " do_sample=True,\n", + " top_p=0.8\n", + ")\n", + "\n", + "# output sequences 출력하기\n", + "print(\"Output:\\n\" + 100 * '-')\n", + "print(tokenizer.decode(sample_output[0], skip_special_tokens = True), '...')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "heGKePrAE46H" + }, + "source": [ + "### **Top-K + Top-P sampling**\n", + "- 둘을 동시에 사용하면, 확률이 매우 낮은 단어(이상한 단어)가 나올 가능성을 줄이면서도, 선택되는 단어 집합의 크기는 유동적으로 유지할 수 있습니다." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Q8-CnW76E3FI", + "outputId": "8d547cbd-53ac-4710-d81b-6162649eddcf" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Output:\n", + "----------------------------------------------------------------------------------------------------\n", + "0: I don't know about you, but there's only one thing I want to do after a long day of work: go to the beach. 
And I don't mean just a little bit of sand, I mean a whole lot of sand. I want to be out in the ocean. I want to go to the beach, and I want to get all of my sunburned out.\n", + "\n", + "So I bought a beach umbrella. It's made from a rubberized fabric that's easy to wash, and it's the perfect size for me. It's about the size of my hand, and I can't really get it all the way around to the front. I'm not sure why...\n", + "\n", + "1: I don't know about you, but there's only one thing I want to do after a long day of work: sit down and watch a movie. I've got a few favorites.\n", + "\n", + "1. The Good, the Bad, and the Ugly (1942)\n", + "\n", + "The Good, the Bad, and the Ugly is one of the most famous films of all time, and it's also one of the most underrated. It's a film about a group of misfits who are sent to a war-torn village in order to help out the local population. The Good, the Bad, and the Ugly is a film about the human condition, and it's also about...\n", + "\n", + "2: I don't know about you, but there's only one thing I want to do after a long day of work: go to a movie. I'm a big fan of the '80s, and the '90s, and the 2000s, and the 2010s. I'm a huge fan of the '70s, and the '80s, and the '90s.\n", + "\n", + "You know, I just watched a documentary on the '90s. I don't know what it's called, but it's about the '90s. It's called \"The Rise of the New Millennium.\" It's a great documentary. I just want to see a movie that's about...\n", + "\n", + "3: I don't know about you, but there's only one thing I want to do after a long day of work, and that's to watch the game on TV. I'm not the only one. I know a lot of people that watch the game, and they're all like, 'Man, I don't want to do this anymore. I don't want to do this anymore.' And I'm like, 'Well, that's not gonna happen. 
I'm gonna do this.'\"\n", + "\n", + "Brock's \"The Biggest Loser\" was canceled after the show's fourth season.\n", + "\n", + "Brock's final season of \"The Biggest Loser\" was cut short after the...\n", + "\n", + "4: I don't know about you, but there's only one thing I want to do after a long day of work, and that's to sit back and relax.\"\n", + "\n", + "\"Well, I guess that's a good thing,\" said Weiss, nodding. \"I'm glad you're here.\"\n", + "\n", + "\"Yeah, I know. I'm glad you're here.\"\n", + "\n", + "\"Yeah, I'm glad you're here.\"\n", + "\n", + "\"Yeah, I'm glad you're here.\"\n", + "\n", + "\"Yeah, I'm glad you're here.\"\n", + "\n", + "\"Yeah, I'm glad you're here.\"\n", + "\n", + "\"Yeah, I'm glad you're here.\"\n", + "\n", + "\"Yeah, I...\n", + "\n" + ] + } + ], + "source": [ + "# top_k와 top_p에 값을 지정하면 되고, temperature 파라미터도 함께 사용할 수 있습니다.\n", + "# 아래 코드를 완성해주세요.\n", + "# 이때 max_length= 2*MAX_LEN 으로 설정해주세요\n", + "sample_outputs = GPT2.generate(\n", + " input_ids,\n", + " max_length=2 * MAX_LEN,\n", + " do_sample=True,\n", + " top_k=50,\n", + " top_p=0.8,\n", + " temperature=0.7,\n", + " num_return_sequences=5\n", + ")\n", + "# output sequences 출력하기\n", + "print(\"Output:\\n\" + 100 * '-')\n", + "for i, sample_output in enumerate(sample_outputs):\n", + " print(\"{}: {}...\".format(i, tokenizer.decode(sample_output, skip_special_tokens = True)))\n", + " print('')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "s_TeJ5zXF6Ra" + }, + "source": [ + "💡**Top-k와 Top-p의 방식의 차이에 대해 설명해주세요**\n", + "\n", + "\n", + "---\n", + "## Top-k\n", + "\n", + "- 항상 확률 상위 *k*개의 토큰만 남기고 샘플링한다\n", + "\n", + "- **특징** \n", + " - 후보 집합 크기가 **고정**되어 있어, 계산량을 예측하기 쉽다 \n", + " - 그러나 확률이 낮더라도 상위 *k* 안에 들어가 있으면 샘플링 대상이 되므로, 문맥에 어울리지 않는 무작위성이 발생할 수 있다\n", + "\n", + "---\n", + "\n", + "## Top-p \n", + "\n", + "- 조건부 확률 분포를 내림차순으로 정렬한 뒤, 누적 확률이 *p*가 될 때까지 상위 토큰들을 모아 그 안에서 샘플링한다\n", + "\n", + "- **특징** \n", + " - 후보 집합 크기가 문맥에 따라 **동적으로 변동**한다 \n", + " - 문맥에 알맞은 토큰들만 고려하여 
무작위성을 조절하기 때문에, 더 자연스럽고 안정적인 출력을 얻을 수 있다\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "03791febac134be9bcd6b378127fb7a5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_d212d272ab1942a7a875799788da02df", + "IPY_MODEL_3b4a076a5e1041388bf285d9e05061d2", + "IPY_MODEL_df5b37307e2443d0a263966e1f91133b" + ], + "layout": "IPY_MODEL_ab66fec2dc584fd1a369bd4aa4c8ee9d" + } + }, + "03cc758ab39a4a0b822c14cb46f433b2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7d2890d7ce574771b28b0e595c8fe232", + "placeholder": "​", + "style": "IPY_MODEL_09f42b6f43b44205bf9730cd77cda528", + "value": " 456k/456k [00:00<00:00, 20.9MB/s]" + } + }, + "09d272d68a564dea9537fa87e20247c1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": 
"LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "09f42b6f43b44205bf9730cd77cda528": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "127cffac102a48f9a0fa0ff6f111a5cf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": 
"StyleView", + "description_width": "" + } + }, + "130853ac235e4a0a95ad94afa92eab44": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "198f2dd8e7aa430cb14a09476867c950": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "1c7e8cd2ced64a65807c95407deb0cb9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_09d272d68a564dea9537fa87e20247c1", + "placeholder": "​", + "style": "IPY_MODEL_94d886400a414791bf5ee907ff943def", + "value": " 3.25G/3.25G [00:44<00:00, 35.2MB/s]" + } + }, + "216a04bb3a4848dfaad2920b827b4b6e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "21e7522ed55a40ef8d29142acc3d069d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "26f6ef105a964324a84800b82754bd93": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "2a6b2b09f17f49969b67e4620459bb23": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_c9f766dda0da4c0194c0ddcd0cd61919", + "IPY_MODEL_52a7a4e16916470294f2e7eb1fffdffc", + "IPY_MODEL_efa6f9fc9a3b48b1a420726468e5fd3b" + ], + "layout": "IPY_MODEL_5e81b02a5622493199f44d098a34d2d4" + } + }, + "2d4f1219ef25408aa0725c0a49b9ea5e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_f31d37c927fd425e996c43e725a663ca", + "IPY_MODEL_6b16506a0f8b491a9a0cb64c66773ff6", + "IPY_MODEL_abd181c4d3ed4eca9c5269d75775c833" + ], + "layout": "IPY_MODEL_ac9cad3163ca4f929351a83cbe2edeb4" 
+ } + }, + "30e77991e34b462ab4f369c70d6f7ebc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_690bd98f6dc9409e87d4466aed4485f6", + "max": 3247159078, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_b8646219a56344ef82985203817b6d24", + "value": 3247159078 + } + }, + "35f506c453dc4f0f9e42dae9d2487c7d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "3acec24844b34e09a016e64f26eafcf6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e651f62916f24655b722ed452b1062e4", + "placeholder": "​", + "style": "IPY_MODEL_198f2dd8e7aa430cb14a09476867c950", + "value": "tokenizer.json: 100%" + } + }, + "3b4a076a5e1041388bf285d9e05061d2": { + "model_module": "@jupyter-widgets/controls", + 
"model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8d0dc89c154c4fec9d04f5794abd20f3", + "max": 666, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_35f506c453dc4f0f9e42dae9d2487c7d", + "value": 666 + } + }, + "3b655421119a4d0e945bc4f5e38dab01": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c0ce81d32c0047968528dc89bbf3580b", + "max": 456318, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_d233c8c5fb8648a0a63aae44f92657a2", + "value": 456318 + } + }, + "41aa17ebc83e464c871a506f231eba30": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + 
"grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4fd5dff2c25a4480a2174ae1d81034f2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "52a7a4e16916470294f2e7eb1fffdffc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + 
"model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_57e3af88b7fc4ec094429df84d12122d", + "max": 1042301, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_26f6ef105a964324a84800b82754bd93", + "value": 1042301 + } + }, + "56b0c404027b4d8da66794d29a5f7696": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6235c74c67304ad79649955b9bb56815", + "placeholder": "​", + "style": "IPY_MODEL_5ae64d63f72e4fa199526bbf5fbf20d6", + "value": "model.safetensors: 100%" + } + }, + "57e3af88b7fc4ec094429df84d12122d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + 
"grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "59148ba271f147599d5ad1a1a3ca8712": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "5a438f5d61ec44e49989e5612e0c06da": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + 
"min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5ae64d63f72e4fa199526bbf5fbf20d6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "5d6fbbc06ce049578d1c300767dfc5c6": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5e81b02a5622493199f44d098a34d2d4": { + 
"model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5eb218303fb4438c9bb64d4440d030ef": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "60a4e1d50fd444e1ae57eab72271abca": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": 
null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "6235c74c67304ad79649955b9bb56815": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "690bd98f6dc9409e87d4466aed4485f6": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": 
null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6b16506a0f8b491a9a0cb64c66773ff6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_89fa32bdca4a48e3ba414feedfcb04a2", + "max": 26, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_60a4e1d50fd444e1ae57eab72271abca", + "value": 26 + } + }, + "705d739638ac44818056709a166bdad1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_c0ab53a39af24f068900f669db515a05", + "IPY_MODEL_3b655421119a4d0e945bc4f5e38dab01", + "IPY_MODEL_03cc758ab39a4a0b822c14cb46f433b2" 
+ ], + "layout": "IPY_MODEL_c2f091c1b7bc4a6480388a4c585baf5b" + } + }, + "73e5adfbe1534c4190c9b8c91a572086": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "7d2890d7ce574771b28b0e595c8fe232": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "80ff4590a7ec4a7eafb72dc173e04e48": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + 
"_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "81c2fbac07ad44ffb311b6ac66743ed7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "89fa32bdca4a48e3ba414feedfcb04a2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + 
"bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8d0dc89c154c4fec9d04f5794abd20f3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + 
"94d886400a414791bf5ee907ff943def": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a4ac57284b044e0a963a0d6c751e1858": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ab0e43b3c36943a5bd9ccf3f4a3c9ede": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_41aa17ebc83e464c871a506f231eba30", + "placeholder": "​", + "style": "IPY_MODEL_a4ac57284b044e0a963a0d6c751e1858", + "value": " 1.36M/1.36M [00:00<00:00, 34.4MB/s]" + } + }, + "ab66fec2dc584fd1a369bd4aa4c8ee9d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + 
"_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "abd181c4d3ed4eca9c5269d75775c833": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4fd5dff2c25a4480a2174ae1d81034f2", + "placeholder": "​", + "style": "IPY_MODEL_73e5adfbe1534c4190c9b8c91a572086", + "value": " 26.0/26.0 [00:00<00:00, 1.16kB/s]" + } + }, + "ac9cad3163ca4f929351a83cbe2edeb4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, 
+ "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b4603ba94d6346c8b9e39f08d038c3ae": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f95cb57979074d0f91bf769d6d37cd90", + "max": 1355256, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_5eb218303fb4438c9bb64d4440d030ef", + "value": 1355256 + } + }, + "b8646219a56344ef82985203817b6d24": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": 
"" + } + }, + "bbbfdacb13dd40c2aa1d6f3fd4c59ef7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c0ab53a39af24f068900f669db515a05": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5d6fbbc06ce049578d1c300767dfc5c6", + "placeholder": "​", + "style": "IPY_MODEL_21e7522ed55a40ef8d29142acc3d069d", + "value": "merges.txt: 100%" + } + }, + "c0ce81d32c0047968528dc89bbf3580b": { + "model_module": "@jupyter-widgets/base", 
+ "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c29b119b2b8343e5b12e80792480ed99": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "c2f091c1b7bc4a6480388a4c585baf5b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": 
"1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c9f766dda0da4c0194c0ddcd0cd61919": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_80ff4590a7ec4a7eafb72dc173e04e48", + "placeholder": "​", + "style": "IPY_MODEL_c29b119b2b8343e5b12e80792480ed99", + "value": "vocab.json: 100%" + } + }, + "cca618e303b34611a77bc54261f9c785": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + 
"children": [ + "IPY_MODEL_56b0c404027b4d8da66794d29a5f7696", + "IPY_MODEL_30e77991e34b462ab4f369c70d6f7ebc", + "IPY_MODEL_1c7e8cd2ced64a65807c95407deb0cb9" + ], + "layout": "IPY_MODEL_130853ac235e4a0a95ad94afa92eab44" + } + }, + "d0322248bf514595a619b2dbde524697": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d212d272ab1942a7a875799788da02df": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_d0322248bf514595a619b2dbde524697", + "placeholder": "​", + "style": "IPY_MODEL_59148ba271f147599d5ad1a1a3ca8712", + "value": "config.json: 100%" + } + }, + "d233c8c5fb8648a0a63aae44f92657a2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "d700bf4777044b609c302ce42d369e1c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d89a8e0780544add9746bf1ea2e673d9": { + "model_module": "@jupyter-widgets/controls", + 
"model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_3acec24844b34e09a016e64f26eafcf6", + "IPY_MODEL_b4603ba94d6346c8b9e39f08d038c3ae", + "IPY_MODEL_ab0e43b3c36943a5bd9ccf3f4a3c9ede" + ], + "layout": "IPY_MODEL_bbbfdacb13dd40c2aa1d6f3fd4c59ef7" + } + }, + "df5b37307e2443d0a263966e1f91133b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_216a04bb3a4848dfaad2920b827b4b6e", + "placeholder": "​", + "style": "IPY_MODEL_127cffac102a48f9a0fa0ff6f111a5cf", + "value": " 666/666 [00:00<00:00, 67.1kB/s]" + } + }, + "e651f62916f24655b722ed452b1062e4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": 
null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "efa6f9fc9a3b48b1a420726468e5fd3b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d700bf4777044b609c302ce42d369e1c", + "placeholder": "​", + "style": "IPY_MODEL_f04b4cf625dc47f099f648251146b4bf", + "value": " 1.04M/1.04M [00:00<00:00, 12.3MB/s]" + } + }, + "f04b4cf625dc47f099f648251146b4bf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f31d37c927fd425e996c43e725a663ca": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + 
"_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5a438f5d61ec44e49989e5612e0c06da", + "placeholder": "​", + "style": "IPY_MODEL_81c2fbac07ad44ffb311b6ac66743ed7", + "value": "tokenizer_config.json: 100%" + } + }, + "f95cb57979074d0f91bf769d6d37cd90": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/[Week 12].pdf b/[Week 12].pdf new file mode 100644 index 0000000..d599794 Binary files /dev/null and b/[Week 12].pdf differ diff --git a/wk12.ipynb b/wk12.ipynb new file mode 100644 index 0000000..ace2cf4 --- /dev/null +++ b/wk12.ipynb @@ -0,0 +1,1129 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + 
"outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%matplotlib inline\n", + "import math\n", + "import torch\n", + "from torch import nn\n", + "from matplotlib import pyplot as plt\n", + "\n", + "class PositionalEncoding(nn.Module):\n", + " def __init__(self, d_model, max_len, dropout=0.1):\n", + " super().__init__()\n", + " self.dropout = nn.Dropout(p=dropout)\n", + " position = torch.arange(max_len).unsqueeze(1) # (max_len,1)\n", + " div_term = torch.exp(\n", + " torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)\n", + " ) # (d_model/2,)\n", + "\n", + " pe = torch.zeros(max_len, 1, d_model) # (max_len,1,d_model)\n", + " pe[:, 0, 0::2] = torch.sin(position * div_term)\n", + " pe[:, 0, 1::2] = torch.cos(position * div_term)\n", + " self.register_buffer(\"pe\", pe)\n", + "\n", + " def forward(self, x):\n", + " x = x + self.pe[: x.size(0)]\n", + " return self.dropout(x)\n", + "\n", + "# 인스턴스 생성\n", + "encoding = PositionalEncoding(d_model=128, max_len=50)\n", + "\n", + "# 그래디언트 분리 → CPU 이동 → squeeze → 파이썬 리스트 변환\n", + "arr = encoding.pe.detach().cpu().squeeze(1).tolist() # shape: [50][128] 리스트\n", + "\n", + "# 그리기\n", + "plt.figure(figsize=(8, 4))\n", + "plt.pcolormesh(arr, cmap=\"RdBu\")\n", + "plt.xlabel(\"Embedding Dimension\")\n", + "plt.xlim((0, 128))\n", + "plt.ylabel(\"Position\")\n", + "plt.colorbar(label=\"Encoding value\")\n", + "plt.title(\"Positional Encoding Heatmap\")\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'de' are deprecated. 
Please use the\n", + "full pipeline package name 'de_core_news_sm' instead.\u001b[0m\n", + "Collecting de-core-news-sm==3.8.0\n", + " Using cached https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.8.0/de_core_news_sm-3.8.0-py3-none-any.whl (14.6 MB)\n", + "Installing collected packages: de-core-news-sm\n", + "Successfully installed de-core-news-sm-3.8.0\n", + "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n", + "You can now load the package via spacy.load('de_core_news_sm')\n", + "\u001b[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the\n", + "full pipeline package name 'en_core_web_sm' instead.\u001b[0m\n", + "Collecting en-core-web-sm==3.8.0\n", + " Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)\n", + "Installing collected packages: en-core-web-sm\n", + "Successfully installed en-core-web-sm-3.8.0\n", + "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n", + "You can now load the package via spacy.load('en_core_web_sm')\n" + ] + } + ], + "source": [ + "!python -m spacy download de\n", + "!python -m spacy download en" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "German vocab size: 19214\n", + "English vocab size: 10837\n" + ] + } + ], + "source": [ + "import requests\n", + "import gzip\n", + "import io\n", + "from torchtext.data.utils import get_tokenizer\n", + "from torchtext.vocab import build_vocab_from_iterator\n", + "\n", + "# 1) 올바른 raw URL\n", + "URL_DE = \"https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/train.de.gz\"\n", + "URL_EN = \"https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/train.en.gz\"\n", + "\n", + "def download_and_read_gz(url):\n", + " resp = requests.get(url)\n", + " resp.raise_for_status()\n", + " # 메모리 
상에서 gzip 해제\n", + " with gzip.GzipFile(fileobj=io.BytesIO(resp.content)) as gz:\n", + " return [line.decode(\"utf-8\").strip() for line in gz]\n", + "\n", + "# 2) 독일어·영어 문장 리스트 로드\n", + "de_sentences = download_and_read_gz(URL_DE)\n", + "en_sentences = download_and_read_gz(URL_EN)\n", + "raw_train = list(zip(de_sentences, en_sentences)) # [(de, en), ...]\n", + "\n", + "# 3) 토크나이저·특수토큰 세팅\n", + "SRC, TGT = \"de\", \"en\"\n", + "UNK_IDX = 0\n", + "specials = [\"\", \"\", \"\", \"\"]\n", + "tokenizer = {\n", + " SRC: get_tokenizer(\"spacy\", language=\"de_core_news_sm\"),\n", + " TGT: get_tokenizer(\"spacy\", language=\"en_core_web_sm\"),\n", + "}\n", + "\n", + "# 4) 토큰 제너레이터\n", + "def generate_tokens(data, lang):\n", + " idx = 0 if lang == SRC else 1\n", + " for src, tgt in data:\n", + " text = src if lang == SRC else tgt\n", + " yield tokenizer[lang](text)\n", + "\n", + "# 5) vocab 빌드\n", + "vocab = {}\n", + "for lang in [SRC, TGT]:\n", + " v = build_vocab_from_iterator(\n", + " generate_tokens(raw_train, lang),\n", + " min_freq=1,\n", + " specials=specials,\n", + " special_first=True,\n", + " )\n", + " v.set_default_index(UNK_IDX)\n", + " vocab[lang] = v\n", + "\n", + "# 확인\n", + "print(\"German vocab size:\", len(vocab[\"de\"]))\n", + "print(\"English vocab size:\", len(vocab[\"en\"]))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "import math\n", + "import torch\n", + "from torch import nn\n", + "\n", + "\n", + "class PositionalEncoding(nn.Module):\n", + " def __init__(self, d_model, max_len, dropout=0.1):\n", + " super().__init__()\n", + " self.dropout = nn.Dropout(p=dropout)\n", + "\n", + " position = torch.arange(max_len).unsqueeze(1)\n", + " div_term = torch.exp(\n", + " torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)\n", + " )\n", + "\n", + " pe = torch.zeros(max_len, 1, d_model)\n", + " pe[:, 0, 0::2] = torch.sin(position * div_term)\n", + " pe[:, 0, 1::2] = 
torch.cos(position * div_term)\n", + " self.register_buffer(\"pe\", pe)\n", + "\n", + " def forward(self, x):\n", + " x = x + self.pe[: x.size(0)]\n", + " return self.dropout(x)\n", + "\n", + "\n", + "class TokenEmbedding(nn.Module):\n", + " def __init__(self, vocab_size, emb_size):\n", + " super().__init__()\n", + " self.embedding = nn.Embedding(vocab_size, emb_size)\n", + " self.emb_size = emb_size\n", + "\n", + " def forward(self, tokens):\n", + " return self.embedding(tokens.long()) * math.sqrt(self.emb_size)\n", + "\n", + "\n", + "class Seq2SeqTransformer(nn.Module):\n", + " def __init__(\n", + " self,\n", + " num_encoder_layers,\n", + " num_decoder_layers,\n", + " emb_size,\n", + " max_len,\n", + " nhead,\n", + " src_vocab_size,\n", + " tgt_vocab_size,\n", + " dim_feedforward,\n", + " dropout=0.1,\n", + " ):\n", + " super().__init__()\n", + " self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)\n", + " self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)\n", + " self.positional_encoding = PositionalEncoding(\n", + " d_model=emb_size, max_len=max_len, dropout=dropout\n", + " )\n", + " self.transformer = nn.Transformer(\n", + " d_model=emb_size,\n", + " nhead=nhead,\n", + " num_encoder_layers=num_encoder_layers,\n", + " num_decoder_layers=num_decoder_layers,\n", + " dim_feedforward=dim_feedforward,\n", + " dropout=dropout,\n", + " )\n", + " self.generator = nn.Linear(emb_size, tgt_vocab_size)\n", + "\n", + " def forward(\n", + " self,\n", + " src,\n", + " trg,\n", + " src_mask,\n", + " tgt_mask,\n", + " src_padding_mask,\n", + " tgt_padding_mask,\n", + " memory_key_padding_mask,\n", + " ):\n", + " src_emb = self.positional_encoding(self.src_tok_emb(src))\n", + " tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))\n", + " outs = self.transformer(\n", + " src=src_emb,\n", + " tgt=tgt_emb,\n", + " src_mask=src_mask,\n", + " tgt_mask=tgt_mask,\n", + " memory_mask=None,\n", + " src_key_padding_mask=src_padding_mask,\n", + " 
tgt_key_padding_mask=tgt_padding_mask,\n", + " memory_key_padding_mask=memory_key_padding_mask\n", + " )\n", + " return self.generator(outs)\n", + "\n", + " def encode(self, src, src_mask):\n", + " return self.transformer.encoder(\n", + " self.positional_encoding(self.src_tok_emb(src)), src_mask\n", + " )\n", + "\n", + " def decode(self, tgt, memory, tgt_mask):\n", + " return self.transformer.decoder(\n", + " self.positional_encoding(self.tgt_tok_emb(tgt)), memory, tgt_mask\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "src_tok_emb\n", + "└ embedding\n", + "tgt_tok_emb\n", + "└ embedding\n", + "positional_encoding\n", + "└ dropout\n", + "transformer\n", + "└ encoder\n", + "│ └ layers\n", + "│ │ └ 0\n", + "│ │ └ 1\n", + "│ │ └ 2\n", + "│ └ norm\n", + "└ decoder\n", + "│ └ layers\n", + "│ │ └ 0\n", + "│ │ └ 1\n", + "│ │ └ 2\n", + "│ └ norm\n", + "generator\n" + ] + } + ], + "source": [ + "# --- 앞에서 build한 vocab 딕셔너리 ---\n", + "# vocab = {\"de\": de_vocab, \"en\": en_vocab}\n", + "\n", + "SRC_LANGUAGE, TGT_LANGUAGE = \"de\", \"en\"\n", + "PAD_IDX = 1 # 앞서 설정하신 패딩 인덱스\n", + "\n", + "# 모델 생성 시 vocab 대신 vocab_transform 이름으로 참조하거나\n", + "# 바로 vocab을 사용하세요.\n", + "model = Seq2SeqTransformer(\n", + " num_encoder_layers=3,\n", + " num_decoder_layers=3,\n", + " emb_size=512,\n", + " max_len=512,\n", + " nhead=8,\n", + " src_vocab_size=len(vocab[SRC_LANGUAGE]), # ← 여기만 바뀜\n", + " tgt_vocab_size=len(vocab[TGT_LANGUAGE]), # ← 그리고 여기\n", + " dim_feedforward=512,\n", + ").to(DEVICE)\n", + "\n", + "criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX).to(DEVICE)\n", + "optimizer = optim.Adam(model.parameters())\n", + "\n", + "for main_name, main_module in model.named_children():\n", + " print(main_name)\n", + " for sub_name, sub_module in main_module.named_children():\n", + " print(\"└\", sub_name)\n", + " for ssub_name, ssub_module in 
sub_module.named_children():\n", + " print(\"│ └\", ssub_name)\n", + " for sssub_name, sssub_module in ssub_module.named_children():\n", + " print(\"│ │ └\", sssub_name)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "source_batch shape: torch.Size([35, 128])\n", + "target_batch shape: torch.Size([30, 128])\n", + "example source indices:\n", + " tensor([[ 2, 2, 2, 2, 2],\n", + " [ 14, 5, 5, 21, 5],\n", + " [ 38, 12, 35, 31, 12],\n", + " [ 24, 281, 10, 957, 10],\n", + " [ 243, 7, 1205, 18, 7190],\n", + " [2744, 6, 32, 420, 2328],\n", + " [8680, 83, 11, 0, 8],\n", + " [ 11, 245, 34, 11, 16],\n", + " [ 20, 11, 792, 6, 18],\n", + " [ 892, 6, 13, 0, 362],\n", + " [ 3, 444, 17, 318, 3914],\n", + " [ 1, 4, 4, 11, 62],\n", + " [ 1, 3, 3, 3, 8],\n", + " [ 1, 1, 1, 1, 32],\n", + " [ 1, 1, 1, 1, 7],\n", + " [ 1, 1, 1, 1, 6],\n", + " [ 1, 1, 1, 1, 115],\n", + " [ 1, 1, 1, 1, 197],\n", + " [ 1, 1, 1, 1, 4],\n", + " [ 1, 1, 1, 1, 3],\n", + " [ 1, 1, 1, 1, 1],\n", + " [ 1, 1, 1, 1, 1],\n", + " [ 1, 1, 1, 1, 1],\n", + " [ 1, 1, 1, 1, 1],\n", + " [ 1, 1, 1, 1, 1],\n", + " [ 1, 1, 1, 1, 1],\n", + " [ 1, 1, 1, 1, 1],\n", + " [ 1, 1, 1, 1, 1],\n", + " [ 1, 1, 1, 1, 1],\n", + " [ 1, 1, 1, 1, 1],\n", + " [ 1, 1, 1, 1, 1],\n", + " [ 1, 1, 1, 1, 1],\n", + " [ 1, 1, 1, 1, 1],\n", + " [ 1, 1, 1, 1, 1],\n", + " [ 1, 1, 1, 1, 1]])\n", + "example target indices:\n", + " tensor([[ 2, 2, 2, 2, 2],\n", + " [ 6, 6, 6, 19, 6],\n", + " [ 39, 12, 35, 36, 1489],\n", + " [ 13, 383, 21, 412, 12],\n", + " [ 36, 7, 759, 50, 21],\n", + " [ 17, 4, 96, 4, 4],\n", + " [1667, 51, 9, 30, 31],\n", + " [2541, 189, 4, 272, 748],\n", + " [ 342, 9, 16, 382, 85],\n", + " [ 4, 4, 106, 1923, 10],\n", + " [ 282, 439, 893, 9, 32],\n", + " [ 3, 5, 5, 28, 7],\n", + " [ 1, 3, 3, 5504, 4],\n", + " [ 1, 1, 1, 79, 77],\n", + " [ 1, 1, 1, 325, 184],\n", + " [ 1, 1, 1, 3, 5],\n", + " [ 1, 1, 1, 1, 3],\n", + " [ 1, 
1, 1, 1, 1],\n", + " [ 1, 1, 1, 1, 1],\n", + " [ 1, 1, 1, 1, 1],\n", + " [ 1, 1, 1, 1, 1],\n", + " [ 1, 1, 1, 1, 1],\n", + " [ 1, 1, 1, 1, 1],\n", + " [ 1, 1, 1, 1, 1],\n", + " [ 1, 1, 1, 1, 1],\n", + " [ 1, 1, 1, 1, 1],\n", + " [ 1, 1, 1, 1, 1],\n", + " [ 1, 1, 1, 1, 1],\n", + " [ 1, 1, 1, 1, 1],\n", + " [ 1, 1, 1, 1, 1]])\n" + ] + } + ], + "source": [ + "import requests\n", + "import gzip\n", + "import io\n", + "import math\n", + "import torch\n", + "from torch import nn\n", + "from torch.utils.data import DataLoader\n", + "from torch.nn.utils.rnn import pad_sequence\n", + "from torchtext.data.utils import get_tokenizer\n", + "from torchtext.vocab import build_vocab_from_iterator\n", + "\n", + "# === 0) 앞에서 정의한 download_and_read_gz 함수 재사용 ===\n", + "def download_and_read_gz(url):\n", + " resp = requests.get(url)\n", + " resp.raise_for_status()\n", + " with gzip.GzipFile(fileobj=io.BytesIO(resp.content)) as gz:\n", + " return [line.decode(\"utf-8\").strip() for line in gz]\n", + "\n", + "# === 1) train/valid 데이터 로드 ===\n", + "URLS = {\n", + " \"train\": {\n", + " \"de\": \"https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/train.de.gz\",\n", + " \"en\": \"https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/train.en.gz\",\n", + " },\n", + " \"valid\": {\n", + " \"de\": \"https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/val.de.gz\",\n", + " \"en\": \"https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/val.en.gz\",\n", + " }\n", + "}\n", + "\n", + "raw_data = {}\n", + "for split in [\"train\", \"valid\"]:\n", + " de = download_and_read_gz(URLS[split][\"de\"])\n", + " en = download_and_read_gz(URLS[split][\"en\"])\n", + " raw_data[split] = list(zip(de, en))\n", + "\n", + "raw_train = raw_data[\"train\"]\n", + "raw_valid = raw_data[\"valid\"]\n", + "\n", + "# === 2) 토크나이저 및 vocab 생성 ===\n", + "SRC, TGT = \"de\", \"en\"\n", + "BOS_IDX, EOS_IDX, PAD_IDX, UNK_IDX = 2, 3, 1, 0\n", + 
"specials = [\"\", \"\", \"\", \"\"]\n", + "\n", + "tokenizer = {\n", + " SRC: get_tokenizer(\"spacy\", language=\"de_core_news_sm\"),\n", + " TGT: get_tokenizer(\"spacy\", language=\"en_core_web_sm\"),\n", + "}\n", + "\n", + "def generate_tokens(data, lang):\n", + " idx = 0 if lang == SRC else 1\n", + " for src, tgt in data:\n", + " text = src if lang == SRC else tgt\n", + " yield tokenizer[lang](text)\n", + "\n", + "vocab = {}\n", + "for lang in [SRC, TGT]:\n", + " v = build_vocab_from_iterator(\n", + " generate_tokens(raw_train, lang),\n", + " min_freq=1,\n", + " specials=specials,\n", + " special_first=True,\n", + " )\n", + " v.set_default_index(UNK_IDX)\n", + " vocab[lang] = v\n", + "\n", + "# === 3) 텍스트 → 토큰ID 변환 트랜스폼 정의 ===\n", + "def sequential_transforms(*transforms):\n", + " def fn(txt):\n", + " for transform in transforms:\n", + " txt = transform(txt)\n", + " return txt\n", + " return fn\n", + "\n", + "def input_transform(token_ids):\n", + " return torch.cat([\n", + " torch.tensor([BOS_IDX]),\n", + " torch.tensor(token_ids),\n", + " torch.tensor([EOS_IDX])\n", + " ])\n", + "\n", + "text_transform = {}\n", + "for lang in [SRC, TGT]:\n", + " text_transform[lang] = sequential_transforms(\n", + " tokenizer[lang], # raw string → 리스트[str]\n", + " vocab[lang], # 리스트[str] → 리스트[int]\n", + " input_transform # 리스트[int] → Tensor[...]\n", + " )\n", + "\n", + "# === 4) collate_fn & DataLoader 생성 ===\n", + "def collator(batch):\n", + " src_batch, tgt_batch = [], []\n", + " for src_str, tgt_str in batch:\n", + " src_batch.append(text_transform[SRC](src_str))\n", + " tgt_batch.append(text_transform[TGT](tgt_str))\n", + " src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)\n", + " tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)\n", + " return src_batch, tgt_batch\n", + "\n", + "BATCH_SIZE = 128\n", + "valid_loader = DataLoader(raw_valid, batch_size=BATCH_SIZE, collate_fn=collator)\n", + "\n", + "# === 5) 한 배치 꺼내서 확인 ===\n", + "src_batch, tgt_batch = 
next(iter(valid_loader))\n", + "print(\"source_batch shape:\", src_batch.shape) # (seq_len, batch)\n", + "print(\"target_batch shape:\", tgt_batch.shape)\n", + "print(\"example source indices:\\n\", src_batch[:, :5])\n", + "print(\"example target indices:\\n\", tgt_batch[:, :5])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "src_mask: torch.Size([35, 35])\n", + "tensor([[0., 0., 0., ..., 0., 0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " ...,\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.]])\n", + "tgt_mask: torch.Size([29, 29])\n", + "tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 0.],\n", + " [-inf, 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 0.],\n", + " [-inf, -inf, 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 0.],\n", + " [-inf, -inf, -inf, 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 0.],\n", + " [-inf, -inf, -inf, -inf, 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 0.],\n", + " [-inf, -inf, -inf, -inf, -inf, 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 0.],\n", + " [-inf, -inf, -inf, -inf, -inf, -inf, 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 0.],\n", + " [-inf, -inf, -inf, -inf, -inf, -inf, -inf, 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 0.],\n", + " [-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 
0., 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 0.],\n", + " [-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 0.],\n", + " [-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 0.],\n", + " [-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 0.],\n", + " [-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 0.],\n", + " [-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 0.],\n", + " [-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 0.],\n", + " [-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 0.],\n", + " [-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, 0., 0., 0., 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 0.],\n", + " [-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, 0., 0., 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 0.],\n", + " [-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, 0., 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 0.],\n", + " [-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 0.],\n", + " [-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, 0., 0., 0., 0.,\n", + " 0., 0., 0., 
0., 0.],\n", + " [-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, 0., 0., 0.,\n", + " 0., 0., 0., 0., 0.],\n", + " [-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, 0., 0.,\n", + " 0., 0., 0., 0., 0.],\n", + " [-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, 0.,\n", + " 0., 0., 0., 0., 0.],\n", + " [-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,\n", + " 0., 0., 0., 0., 0.],\n", + " [-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,\n", + " -inf, 0., 0., 0., 0.],\n", + " [-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,\n", + " -inf, -inf, 0., 0., 0.],\n", + " [-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,\n", + " -inf, -inf, -inf, 0., 0.],\n", + " [-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,\n", + " -inf, -inf, -inf, -inf, 0.]])\n", + "src_pad_mask: torch.Size([128, 35])\n", + "tensor([[False, False, False, ..., True, True, True],\n", + " [False, False, False, ..., True, True, True],\n", + " [False, False, False, ..., True, True, True],\n", + " ...,\n", + " [False, False, False, ..., True, True, True],\n", + " [False, False, False, ..., True, True, True],\n", + " [False, False, False, ..., True, True, True]])\n", + "tgt_pad_mask: torch.Size([128, 29])\n", + "tensor([[False, False, False, ..., True, True, True],\n", + " [False, False, False, 
..., True, True, True],\n", + " [False, False, False, ..., True, True, True],\n", + " ...,\n", + " [False, False, False, ..., True, True, True],\n", + " [False, False, False, ..., True, True, True],\n", + " [False, False, False, ..., True, True, True]])\n" + ] + } + ], + "source": [ + "import torch\n", + "\n", + "# 이미 정의된 DEVICE, PAD_IDX 사용\n", + "# DEVICE = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + "# PAD_IDX = 1\n", + "\n", + "def generate_square_subsequent_mask(sz):\n", + " mask = torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1\n", + " mask = (\n", + " mask.float()\n", + " .masked_fill(~mask, float(\"-inf\"))\n", + " .masked_fill(mask, float(0.0))\n", + " )\n", + " return mask\n", + "\n", + "def create_mask(src, tgt_input):\n", + " src_seq_len, _ = src.shape\n", + " tgt_seq_len, _ = tgt_input.shape\n", + "\n", + " # no future look-ahead for decoder\n", + " tgt_mask = generate_square_subsequent_mask(tgt_seq_len)\n", + " # encoder doesn't need causal mask\n", + " src_mask = torch.zeros((src_seq_len, src_seq_len), device=DEVICE)\n", + "\n", + " # padding masks: shape (batch, seq_len)\n", + " src_padding_mask = (src == PAD_IDX).transpose(0, 1)\n", + " tgt_padding_mask = (tgt_input == PAD_IDX).transpose(0, 1)\n", + " return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask\n", + "\n", + "# 예시: valid_loader 에서 배치 꺼내기\n", + "src_batch, tgt_batch = next(iter(valid_loader)) # src_batch: (S, B), tgt_batch: (T, B)\n", + "\n", + "# decoder input은 부터 마지막 전까지\n", + "tgt_input = tgt_batch[:-1, :]\n", + "# 예측 대상(teacher forcing)은 두 번째 토큰부터 끝까지\n", + "tgt_out = tgt_batch[1:, :]\n", + "\n", + "src_mask, tgt_mask, src_pad_mask, tgt_pad_mask = create_mask(src_batch, tgt_input)\n", + "\n", + "print(\"src_mask:\", src_mask.shape)\n", + "print(src_mask)\n", + "\n", + "print(\"tgt_mask:\", tgt_mask.shape)\n", + "print(tgt_mask)\n", + "\n", + "print(\"src_pad_mask:\", src_pad_mask.shape)\n", + "print(src_pad_mask)\n", + "\n", + "print(\"tgt_pad_mask:\", 
tgt_pad_mask.shape)\n", + "print(tgt_pad_mask)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "577080f70b824c5b905db5a3e9cef87f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "config.json: 0%| | 0.00/665 [00:00= 2.1 is required but found 2.0.1\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train Dataset Length : 8550\n", + "Valid Dataset Length : 526\n", + "Test Dataset Length : 515\n" + ] + } + ], + "source": [ + "import torch\n", + "from torchtext.datasets import CoLA\n", + "from transformers import AutoTokenizer\n", + "from torch.utils.data import DataLoader\n", + "\n", + "\n", + "def collator(batch, tokenizer, device):\n", + " source, labels, texts = zip(*batch)\n", + " tokenized = tokenizer(\n", + " texts,\n", + " padding=\"longest\",\n", + " truncation=True,\n", + " return_tensors=\"pt\"\n", + " )\n", + " input_ids = tokenized[\"input_ids\"].to(device)\n", + " attention_mask = tokenized[\"attention_mask\"].to(device)\n", + " labels = torch.tensor(labels, dtype=torch.long).to(device)\n", + " return input_ids, attention_mask, labels\n", + "\n", + "\n", + "train_data = list(CoLA(split=\"train\"))\n", + "valid_data = list(CoLA(split=\"dev\"))\n", + "test_data = list(CoLA(split=\"test\"))\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n", + "tokenizer.pad_token = tokenizer.eos_token\n", + "\n", + "epochs = 3\n", + "batch_size = 16\n", + "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + "\n", + "train_dataloader = DataLoader(\n", + " train_data,\n", + " batch_size=batch_size,\n", + " collate_fn=lambda x: collator(x, tokenizer, device),\n", + " shuffle=True,\n", + ")\n", + "valid_dataloader = DataLoader(\n", + " valid_data, batch_size=batch_size, collate_fn=lambda x: collator(x, tokenizer, device)\n", + ")\n", + "test_dataloader = DataLoader(\n", + " 
def _count_correct(preds, labels):
    """Return how many rows of ``preds`` have their arg-max equal to the label."""
    predicted = np.argmax(preds, axis=1).flatten()
    return int(np.sum(predicted == labels.flatten()))


def calc_accuracy(preds, labels):
    """Compute the fraction of predictions whose arg-max class matches the label.

    Args:
        preds: NumPy array of per-class scores; arg-max is taken over axis 1.
        labels: NumPy array of integer class ids, flattened before comparison.

    Returns:
        Number of matches divided by the number of labels.
    """
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


def train(model, optimizer, dataloader):
    """Run one training pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Callable accepting ``input_ids``, ``attention_mask`` and
            ``labels`` keywords and returning an object with a ``loss``
            attribute; must also provide ``train()``.
        optimizer: Optimizer providing ``zero_grad()`` and ``step()``.
        dataloader: Sized iterable of ``(input_ids, attention_mask, labels)``.

    Returns:
        Sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.train()
    train_loss = 0.0

    for input_ids, attention_mask, labels in dataloader:
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )

        loss = outputs.loss
        train_loss += loss.item()

        # Clear gradients left over from the previous step before backprop.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_loss = train_loss / len(dataloader)
    return train_loss


def evaluation(model, dataloader):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Loss and accuracy are averaged per *example*, not per batch. The earlier
    version divided the sum of per-batch values by the number of batches,
    which over-weights a final batch that is smaller than the others.

    Args:
        model: Callable accepting ``input_ids``, ``attention_mask`` and
            ``labels`` keywords and returning an object with ``logits`` and
            ``loss`` attributes; must also provide ``eval()``.
        dataloader: Iterable of ``(input_ids, attention_mask, labels)``.

    Returns:
        Tuple ``(val_loss, val_accuracy)``.

    Raises:
        ZeroDivisionError: If the dataloader yields no examples.
    """
    total_loss, total_correct, total_examples = 0.0, 0, 0

    with torch.no_grad():
        model.eval()

        for input_ids, attention_mask, labels in dataloader:
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )

            logits = outputs.logits.detach().cpu().numpy()
            label_ids = labels.to("cpu").numpy()
            batch_examples = label_ids.size

            # assumes outputs.loss is the mean over the batch, so scaling by
            # the batch size recovers the summed loss — TODO confirm for
            # models with a different loss reduction.
            total_loss += outputs.loss.item() * batch_examples
            total_correct += _count_correct(logits, label_ids)
            total_examples += batch_examples

    val_loss = total_loss / total_examples
    val_accuracy = total_correct / total_examples
    return val_loss, val_accuracy
evaluation(model, test_dataloader)\n", + "print(f\"Test Loss : {test_loss:.4f}\")\n", + "print(f\"Test Accuracy : {test_accuracy:.4f}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}