Skip to content

Commit 7f7cdc3

Browse files
committed
DEV: Use structured responses for summaries
1 parent f9d641d commit 7f7cdc3

File tree

12 files changed

+124
-49
lines changed

12 files changed

+124
-49
lines changed

lib/completions/llm.rb

+1
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,7 @@ def initialize(dialect_klass, gateway_klass, llm_model, gateway: nil)
241241
# @param feature_context { Hash - Optional } - The feature context to use for the completion.
242242
# @param partial_tool_calls { Boolean - Optional } - If true, the completion will return partial tool calls.
243243
# @param output_thinking { Boolean - Optional } - If true, the completion will return the thinking output for thinking models.
244+
# @param extra_model_params { Hash - Optional } - Other params that are not available across models. e.g. response_format JSON schema.
244245
#
245246
# @param &on_partial_blk { Block - Optional } - The passed block will get called with the LLM partial response alongside a cancel function.
246247
#

lib/personas/bot.rb

+2-2
Original file line numberDiff line numberDiff line change
@@ -64,10 +64,10 @@ def reply(context, llm_args: {}, &update_blk)
6464

6565
user = context.user
6666

67-
llm_kwargs = { user: user }
67+
llm_kwargs = llm_args.dup
68+
llm_kwargs[:user] = user
6869
llm_kwargs[:temperature] = persona.temperature if persona.temperature
6970
llm_kwargs[:top_p] = persona.top_p if persona.top_p
70-
llm_kwargs[:max_tokens] = llm_args[:max_tokens] if llm_args[:max_tokens].present?
7171

7272
needs_newlines = false
7373
tools_ran = 0

lib/personas/short_summarizer.rb

+7-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,13 @@ def system_prompt
1818
- Limit the summary to a maximum of 40 words.
1919
- Do *NOT* repeat the discussion title in the summary.
2020
21-
Return the summary inside <ai></ai> tags.
21+
Format your response as a JSON object with a single key named "summary", which has the summary as the value.
22+
Your output should be in the following format:
23+
<output>
24+
{"summary": "xx"}
25+
</output>
26+
27+
Where "xx" is replaced by the summary.
2228
PROMPT
2329
end
2430
end

lib/personas/summarizer.rb

+8
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,14 @@ def system_prompt
1818
- Example: link to the 6th post by jane: [agreed with]({resource_url}/6)
1919
- Example: link to the 13th post by joe: [joe]({resource_url}/13)
2020
- When formatting usernames either use @USERNAME OR [USERNAME]({resource_url}/POST_NUMBER)
21+
22+
Format your response as a JSON object with a single key named "summary", which has the summary as the value.
23+
Your output should be in the following format:
24+
<output>
25+
{"summary": "xx"}
26+
</output>
27+
28+
Where "xx" is replaced by the summary.
2129
PROMPT
2230
end
2331
end

lib/summarization/fold_content.rb

+53-20
Original file line numberDiff line numberDiff line change
@@ -27,20 +27,14 @@ def initialize(bot, strategy, persist_summaries: true)
2727
def summarize(user, &on_partial_blk)
2828
truncated_content = content_to_summarize.map { |cts| truncate(cts) }
2929

30-
summary = fold(truncated_content, user, &on_partial_blk)
31-
32-
clean_summary = Nokogiri::HTML5.fragment(summary).css("ai")&.first&.text || summary
30+
# Done here to cover non-streaming mode.
31+
json_reply_end = "\"}"
32+
summary = fold(truncated_content, user, &on_partial_blk).chomp(json_reply_end)
3333

3434
if persist_summaries
35-
AiSummary.store!(
36-
strategy,
37-
llm_model,
38-
clean_summary,
39-
truncated_content,
40-
human: user&.human?,
41-
)
35+
AiSummary.store!(strategy, llm_model, summary, truncated_content, human: user&.human?)
4236
else
43-
AiSummary.new(summarized_text: clean_summary)
37+
AiSummary.new(summarized_text: summary)
4438
end
4539
end
4640

@@ -118,17 +112,40 @@ def fold(items, user, &on_partial_blk)
118112
)
119113

120114
summary = +""
115+
# Auxiliary variables to get the summary content from the JSON response.
116+
raw_buffer = +""
117+
json_start_found = false
118+
json_reply_start_regex = /\{\s*"summary"\s*:\s*"/
119+
unescape_regex = %r{\\(["/bfnrt])}
120+
json_reply_end = "\"}"
121+
121122
buffer_blk =
122-
Proc.new do |partial, cancel, placeholder, type|
123+
Proc.new do |partial, cancel, _, type|
123124
if type.blank?
124-
summary << partial
125-
on_partial_blk.call(partial, cancel) if on_partial_blk
125+
if json_start_found
126+
unescaped_partial = partial.gsub(unescape_regex, '\1')
127+
summary << unescaped_partial
128+
129+
on_partial_blk.call(partial, cancel) if on_partial_blk
130+
else
131+
raw_buffer << partial
132+
133+
if raw_buffer.match?(json_reply_start_regex)
134+
buffered_start =
135+
raw_buffer.gsub(json_reply_start_regex, "").gsub(unescape_regex, '\1')
136+
summary << buffered_start
137+
138+
on_partial_blk.call(buffered_start, cancel) if on_partial_blk
139+
140+
json_start_found = true
141+
end
142+
end
126143
end
127144
end
128145

129-
bot.reply(context, &buffer_blk)
146+
bot.reply(context, llm_args: { extra_model_params: response_format }, &buffer_blk)
130147

131-
summary
148+
summary.chomp(json_reply_end)
132149
end
133150

134151
def available_tokens
@@ -155,10 +172,26 @@ def truncate(item)
155172
item
156173
end
157174

158-
def text_only_update(&on_partial_blk)
159-
Proc.new do |partial, cancel, placeholder, type|
160-
on_partial_blk.call(partial, cancel) if type.blank?
161-
end
175+
def response_format
176+
{
177+
response_format: {
178+
type: "json_schema",
179+
json_schema: {
180+
name: "reply",
181+
schema: {
182+
type: "object",
183+
properties: {
184+
summary: {
185+
type: "string",
186+
},
187+
},
188+
required: ["summary"],
189+
additionalProperties: false,
190+
},
191+
strict: true,
192+
},
193+
},
194+
}
162195
end
163196
end
164197
end

spec/jobs/regular/fast_track_topic_gist_spec.rb

+8-3
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,16 @@
2121
created_at: 10.minutes.ago,
2222
)
2323
end
24+
2425
let(:updated_gist) { "They updated me :(" }
2526

27+
def in_json_format(summary)
28+
"{\"summary\":\"#{summary}\"}"
29+
end
30+
2631
context "when it's up to date" do
2732
it "does nothing" do
28-
DiscourseAi::Completions::Llm.with_prepared_responses([updated_gist]) do
33+
DiscourseAi::Completions::Llm.with_prepared_responses([in_json_format(updated_gist)]) do
2934
subject.execute(topic_id: topic_1.id)
3035
end
3136

@@ -39,7 +44,7 @@
3944
before { Fabricate(:post, topic: topic_1, post_number: 3) }
4045

4146
it "regenerates the gist using the latest data" do
42-
DiscourseAi::Completions::Llm.with_prepared_responses([updated_gist]) do
47+
DiscourseAi::Completions::Llm.with_prepared_responses([in_json_format(updated_gist)]) do
4348
subject.execute(topic_id: topic_1.id)
4449
end
4550

@@ -52,7 +57,7 @@
5257
it "does nothing if the gist was created less than 5 minutes ago" do
5358
ai_gist.update!(created_at: 2.minutes.ago)
5459

55-
DiscourseAi::Completions::Llm.with_prepared_responses([updated_gist]) do
60+
DiscourseAi::Completions::Llm.with_prepared_responses([in_json_format(updated_gist)]) do
5661
subject.execute(topic_id: topic_1.id)
5762
end
5863

spec/jobs/regular/stream_topic_ai_summary_spec.rb

+14-3
Original file line numberDiff line numberDiff line change
@@ -50,21 +50,31 @@ def with_responses(responses)
5050
end
5151
end
5252

53+
def in_json_format(summary)
54+
"{\"summary\":\"#{summary}\"}"
55+
end
56+
5357
it "publishes updates with a partial summary" do
54-
with_responses(["dummy"]) do
58+
summary = "dummy"
59+
60+
with_responses([in_json_format(summary)]) do
5561
messages =
5662
MessageBus.track_publish("/discourse-ai/summaries/topic/#{topic.id}") do
5763
job.execute(topic_id: topic.id, user_id: user.id)
5864
end
5965

6066
partial_summary_update = messages.first.data
6167
expect(partial_summary_update[:done]).to eq(false)
62-
expect(partial_summary_update.dig(:ai_topic_summary, :summarized_text)).to eq("dummy")
68+
expect(partial_summary_update.dig(:ai_topic_summary, :summarized_text).chomp("\"}")).to eq(
69+
summary,
70+
)
6371
end
6472
end
6573

6674
it "publishes a final update to signal we're done and provide metadata" do
67-
with_responses(["dummy"]) do
75+
summary = "dummy"
76+
77+
with_responses([in_json_format(summary)]) do
6878
messages =
6979
MessageBus.track_publish("/discourse-ai/summaries/topic/#{topic.id}") do
7080
job.execute(topic_id: topic.id, user_id: user.id)
@@ -73,6 +83,7 @@ def with_responses(responses)
7383
final_update = messages.last.data
7484
expect(final_update[:done]).to eq(true)
7585

86+
expect(final_update.dig(:ai_topic_summary, :summarized_text)).to eq(summary)
7687
expect(final_update.dig(:ai_topic_summary, :algorithm)).to eq("fake")
7788
expect(final_update.dig(:ai_topic_summary, :outdated)).to eq(false)
7889
expect(final_update.dig(:ai_topic_summary, :can_regenerate)).to eq(true)

spec/jobs/scheduled/summaries_backfill_spec.rb

+5-1
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,10 @@
8484
end
8585
end
8686

87+
def in_json_format(summary)
88+
"{\"summary\":\"#{summary}\"}"
89+
end
90+
8791
describe "#execute" do
8892
it "backfills a batch" do
8993
topic_2 =
@@ -98,7 +102,7 @@
98102
gist_2 = "Updated gist of topic"
99103

100104
DiscourseAi::Completions::Llm.with_prepared_responses(
101-
[gist_1, gist_2, summary_1, summary_2],
105+
[gist_1, gist_2, summary_1, summary_2].map { |s| in_json_format(s) },
102106
) { subject.execute({}) }
103107

104108
expect(AiSummary.complete.find_by(target: topic_2).summarized_text).to eq(summary_1)

spec/lib/modules/summarization/fold_content_spec.rb

+3-2
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@
2626
llm_model.update!(max_prompt_tokens: model_tokens)
2727
end
2828

29-
let(:single_summary) { "this is a summary" }
29+
let(:single_summary) { "{\"summary\":\"#{clean_summary}\"}" }
30+
let(:clean_summary) { "this is a summary" }
3031

3132
fab!(:user)
3233

@@ -36,7 +37,7 @@
3637
summarizer.summarize(user).tap { expect(spy.completions).to eq(1) }
3738
end
3839

39-
expect(result.summarized_text).to eq(single_summary)
40+
expect(result.summarized_text).to eq(clean_summary)
4041
end
4142
end
4243

spec/requests/summarization/summary_controller_spec.rb

+3-2
Original file line numberDiff line numberDiff line change
@@ -73,15 +73,16 @@
7373
end
7474

7575
it "returns a summary" do
76-
summary_text = "This is a summary"
76+
clean_summary = "This is a summary"
77+
summary_text = "{\"summary\":\"#{clean_summary}\"}"
7778
DiscourseAi::Completions::Llm.with_prepared_responses([summary_text]) do
7879
get "/discourse-ai/summarization/t/#{topic.id}.json"
7980

8081
expect(response.status).to eq(200)
8182
response_summary = response.parsed_body["ai_topic_summary"]
8283
summary = AiSummary.last
8384

84-
expect(summary.summarized_text).to eq(summary_text)
85+
expect(summary.summarized_text).to eq(clean_summary)
8586
expect(response_summary["summarized_text"]).to eq(summary.summarized_text)
8687
expect(response_summary["algorithm"]).to eq("fake")
8788
expect(response_summary["outdated"]).to eq(false)

spec/services/discourse_ai/topic_summarization_spec.rb

+16-13
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@
1313

1414
let(:strategy) { DiscourseAi::Summarization.topic_summary(topic) }
1515

16+
let(:raw_summary) { "{\"summary\":\"#{clean_summary}\"}" }
17+
let(:clean_summary) { "This is the final summary" }
18+
1619
describe "#summarize" do
1720
subject(:summarization) { described_class.new(strategy, user) }
1821

@@ -30,10 +33,10 @@ def assert_summary_is_cached(topic, summary_response)
3033
let(:summary) { "This is the final summary" }
3134

3235
it "caches the summary" do
33-
DiscourseAi::Completions::Llm.with_prepared_responses([summary]) do
36+
DiscourseAi::Completions::Llm.with_prepared_responses([raw_summary]) do
3437
section = summarization.summarize
35-
expect(section.summarized_text).to eq(summary)
36-
assert_summary_is_cached(topic, summary)
38+
expect(section.summarized_text).to eq(clean_summary)
39+
assert_summary_is_cached(topic, clean_summary)
3740
end
3841
end
3942

@@ -54,7 +57,6 @@ def assert_summary_is_cached(topic, summary_response)
5457

5558
describe "invalidating cached summaries" do
5659
let(:cached_text) { "This is a cached summary" }
57-
let(:updated_summary) { "This is the final summary" }
5860

5961
def cached_summary
6062
AiSummary.find_by(target: topic, summary_type: AiSummary.summary_types[:complete])
@@ -65,7 +67,9 @@ def cached_summary
6567
# once it is cached with_prepared_responses will not work as expected
6668
# since it is glued to the old llm instance
6769
# so we create the cached summary totally independently
68-
DiscourseAi::Completions::Llm.with_prepared_responses([cached_text]) do
70+
DiscourseAi::Completions::Llm.with_prepared_responses(
71+
["{\"summary\": \"#{cached_text}\"}"],
72+
) do
6973
strategy = DiscourseAi::Summarization.topic_summary(topic)
7074
described_class.new(strategy, user).summarize
7175
end
@@ -86,10 +90,10 @@ def cached_summary
8690
before { cached_summary.update!(original_content_sha: "outdated_sha") }
8791

8892
it "returns a new summary" do
89-
DiscourseAi::Completions::Llm.with_prepared_responses([updated_summary]) do
93+
DiscourseAi::Completions::Llm.with_prepared_responses([raw_summary]) do
9094
section = summarization.summarize
9195

92-
expect(section.summarized_text).to eq(updated_summary)
96+
expect(section.summarized_text).to eq(clean_summary)
9397
end
9498
end
9599

@@ -106,10 +110,10 @@ def cached_summary
106110
end
107111

108112
it "returns a new summary if the skip_age_check flag is passed" do
109-
DiscourseAi::Completions::Llm.with_prepared_responses([updated_summary]) do
113+
DiscourseAi::Completions::Llm.with_prepared_responses([raw_summary]) do
110114
section = summarization.summarize(skip_age_check: true)
111115

112-
expect(section.summarized_text).to eq(updated_summary)
116+
expect(section.summarized_text).to eq(clean_summary)
113117
end
114118
end
115119
end
@@ -118,16 +122,15 @@ def cached_summary
118122
end
119123

120124
describe "stream partial updates" do
121-
let(:summary) { "This is the final summary" }
122-
123125
it "receives a blk that is passed to the underlying strategy and called with partial summaries" do
124126
partial_result = +""
125127

126-
DiscourseAi::Completions::Llm.with_prepared_responses([summary]) do
128+
DiscourseAi::Completions::Llm.with_prepared_responses([raw_summary]) do
127129
summarization.summarize { |partial_summary| partial_result << partial_summary }
128130
end
129131

130-
expect(partial_result).to eq(summary)
132+
# In a real world example, this is removed in the returned AiSummary obj.
133+
expect(partial_result.chomp("\"}")).to eq(clean_summary)
131134
end
132135
end
133136
end

spec/system/summarization/topic_summarization_spec.rb

+4-2
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,13 @@
1212
"I like to eat pie. It is a very good dessert. Some people are wasteful by throwing pie at others but I do not do that. I always eat the pie.",
1313
)
1414
end
15-
let(:summarization_result) { "This is a summary" }
15+
16+
let(:clean_summary) { "This is a summary" }
17+
1618
let(:topic_page) { PageObjects::Pages::Topic.new }
1719
let(:summary_box) { PageObjects::Components::AiSummaryTrigger.new }
1820

19-
fab!(:ai_summary) { Fabricate(:ai_summary, target: topic, summarized_text: "This is a summary") }
21+
fab!(:ai_summary) { Fabricate(:ai_summary, target: topic, summarized_text: clean_summary) }
2022

2123
before do
2224
group.add(current_user)

0 commit comments

Comments
 (0)